Example #1
def processMain():
    ## @params: [JOB_NAME]
    args = getResolvedOptions(sys.argv, ['JOB_NAME'])
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    job = Job(glueContext)
    job.init(args['JOB_NAME'], args)
    ds0 = glueContext.create_dynamic_frame.from_catalog(database="autoglues3lineage", table_name="train_sm_s2adb_csv", transformation_ctx="ds0")
    rc1 = ResolveChoice.apply(frame=ds0, choice="make_struct", transformation_ctx="rc1")

    ds3 = glueContext.write_dynamic_frame.from_options(frame=rc1, connection_type="redshift",
                                                       connection_options={
                                                           "url": "jdbc:redshift://redshift-cluster-1.csvp5wcqqxvw.us-east-1.redshift.amazonaws.com:5439/world",
                                                           "dbtable": "atn.gluetable211"},
                                                       transformation_ctx="ds3")
    ds7 = glueContext.write_dynamic_frame.from_options(frame=rc1, connection_type="s3",
                                                       connection_options={
                                                           "path": "s3://asgqatestautomation4/Targetdata211"},
                                                       format="json", transformation_ctx="ds7")

    # ds3 = glueContext.write_dynamic_frame.from_options(frame=rc1, connection_type="postgresql",
    #                                                    connection_options={
    #                                                        "url": "jdbc:postgresql://decheqaperf01v.asg.com:5432/postgres",
    #                                                        "dbtable": "public.gluetable26"},
    #                                                    transformation_ctx="ds3")
    job.commit()
Example #2
    def _get_spark_context():
        spark_context = GlueContext(SparkContext.getOrCreate())
        job = Job(spark_context)
        args = _get_arguments({})
        job.init(args['JOB_NAME'], args)

        return (spark_context, job)
Example #3
def main(
    input_path, output_path, conn_type, module_name, function_name, glue_config=None
):
    """
    This is the main function of the Glue job driver. The idea is to keep it generic so it can be reused to run
    multiple Glue jobs. The driver is responsible for validating input parameters and invoking the proper Glue job;
    it does not contain any job-specific business logic.

    @param input_path: input path contains input files to the job
    @param output_path: output path used to write Glue job results
    @param conn_type: use 'file' to read files from a local file system or 's3' for reading files from S3
    @param module_name: module that contains the Glue job, resolved via reflection
    @param function_name: function inside that module to invoke
    @param glue_config: optional additional Glue configuration parameters

    Parameters example: --JOB_NAME=movie_analytics_job --CONN_TYPE=file --INPUT_PATH=/input --OUTPUT_PATH=/results --MODULE_NAME=tasks --FUNCTION_NAME=create_and_stage_average_rating
    """
    try:
        glue_context = create_glue_context()

        module = importlib.import_module(module_name)
        module.init_glue(glue_context)
        module.init_spark(glue_context.spark_session)
        f = getattr(module, function_name)
        f(input_path, output_path, conn_type)

        # JOB_NAME is not passed into main(), so resolve it from the job arguments here
        args = getResolvedOptions(sys.argv, ["JOB_NAME"])
        job = Job(glue_context)
        job.init(args["JOB_NAME"], args)

        job.commit()

    except Exception as e:
        # sys has no error() helper; write the failure to stderr before re-raising
        print(
            f"Couldn't execute job due to {str(e)} for module: {module_name} and function: {function_name}",
            file=sys.stderr,
        )
        raise e
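
A minimal entry-point sketch for this driver, assuming the parameter names documented above and that create_glue_context() lives in the same module (both are assumptions, not part of the original snippet):

if __name__ == "__main__":
    import sys
    from awsglue.utils import getResolvedOptions

    # Resolve the documented job parameters and hand them to the generic driver
    params = getResolvedOptions(
        sys.argv,
        ["JOB_NAME", "INPUT_PATH", "OUTPUT_PATH", "CONN_TYPE", "MODULE_NAME", "FUNCTION_NAME"],
    )
    main(
        input_path=params["INPUT_PATH"],
        output_path=params["OUTPUT_PATH"],
        conn_type=params["CONN_TYPE"],
        module_name=params["MODULE_NAME"],
        function_name=params["FUNCTION_NAME"],
    )
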
Example #4
def processMain():
    ## @params: [JOB_NAME]
    args = getResolvedOptions(sys.argv, ['JOB_NAME'])
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    job = Job(glueContext)
    job.init(args['JOB_NAME'], args)
    datasource0 = glueContext.create_dynamic_frame.from_catalog(
        database="autoglues3lineage",
        table_name="train_sm_s2adb_csv",
        transformation_ctx="datasource0")
    resolvechoice2 = ResolveChoice.apply(frame=datasource0,
                                         choice="make_struct",
                                         transformation_ctx="resolvechoice2")
    dropnullfields3 = DropNullFields.apply(
        frame=resolvechoice2, transformation_ctx="dropnullfields3")
    datasink4 = glueContext.write_dynamic_frame.from_options(
        frame=dropnullfields3,
        connection_type="s3",
        connection_options={"path": "s3://asgqatestautomation4/Targetdata1A"},
        format="csv",
        transformation_ctx="datasink4")
    datasink5 = glueContext.write_dynamic_frame.from_options(
        frame=dropnullfields3,
        connection_type="s3",
        connection_options={"path": "s3://asgqatestautomation4/Targetdata1B"},
        format="parquet",
        transformation_ctx="datasink5")
    job.commit()
Example #5
def create_glue_context():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session
    job = Job(glueContext)
    args = getResolvedOptions(sys.argv, ['JOB_NAME', 'input'])
    job.init(args['JOB_NAME'], args)

    return spark, job
Example #6
def get_spark_session_and_glue_job(
    glue_args: Dict,
    conf=None,
    py_files: Iterable[str] = None,
    extra_jars: List[str] = None,
):
    """
    Get spark session and AWS glue job.

    Parameters
    ----------
    glue_args
        Dictionary of Argument Name: Argument value
    extra_jars
        Maven coordinates of dependent jars (joined and set as spark.jars.packages)
    conf : Union[pyspark.SparkConf, Dict[str, str]]
        Spark config, either object or dictionary of config options.
    py_files
        Paths to python files (.py, .zip, .egg)

    Returns
    -------
    pyspark.sql.SparkSession, awsglue.job.Job

    """
    from awsglue.context import GlueContext
    from awsglue.job import Job
    from pyspark import SparkContext, SparkConf

    LOGGER.debug("Creating spark session with parameters")
    LOGGER.debug("conf=%s", conf)
    LOGGER.debug("py_files=%s", py_files)
    LOGGER.debug("extra_jars=%s", extra_jars)
    if isinstance(conf, dict):
        spark_conf = SparkConf()
        spark_conf.setAll(conf.items())
    elif isinstance(conf, SparkConf):
        spark_conf = conf
    else:
        spark_conf = None

    if extra_jars and spark_conf:
        spark_dependencies = ",".join(extra_jars)
        spark_conf.set("spark.jars.packages", spark_dependencies)

    sc = SparkContext.getOrCreate(conf=spark_conf)

    if py_files:
        LOGGER.debug("Adding PYFILEs: %s", py_files)
        for py_file in py_files:
            sc.addPyFile(py_file)

    glue_context = GlueContext(sparkContext=sc)
    job = Job(glue_context=glue_context)
    job.init(glue_args["JOB_NAME"], glue_args)

    # .py, .zip or .egg
    return glue_context.spark_session, job
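
A hedged usage sketch for the helper above; the config key and package coordinate are illustrative placeholders, and JOB_NAME is resolved the usual way:

import sys
from awsglue.utils import getResolvedOptions

glue_args = getResolvedOptions(sys.argv, ["JOB_NAME"])
spark, job = get_spark_session_and_glue_job(
    glue_args,
    conf={"spark.sql.session.timeZone": "UTC"},              # forwarded via SparkConf.setAll
    extra_jars=["org.apache.spark:spark-avro_2.12:3.1.2"],   # joined into spark.jars.packages
)
# ... run transformations on `spark` ...
job.commit()
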
def main(args, yesterday):
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    job = Job(glueContext)
    job.init(args['JOB_NAME'], args)

    database = args['database_name']
    table_name = '{0}kinesis_data'.format(args['table_prefix'])

    partition_predicate = \
        'partition_0="{0}" and partition_1="{1:02}" and partition_2="{2:02}"'.format(
            yesterday.year, yesterday.month, yesterday.day)

    datasource0 = glueContext.create_dynamic_frame.from_catalog(
        database=database,
        table_name=table_name,
        push_down_predicate=partition_predicate,
        transformation_ctx='datasource0')

    if datasource0.count() > 0:
        applymapping1 = ApplyMapping.apply(frame=datasource0,
                                           mappings=[('device_id', 'string',
                                                      'device_id', 'string'),
                                                     ('timestamp', 'long',
                                                      'timestamp', 'long')],
                                           transformation_ctx='applymapping1')

        df = applymapping1.toDF()
        dev = df.drop_duplicates(['device_id'])
        device_list = dev.collect()
        result_list = []

        for device in device_list:
            device_id = device['device_id']
            df2 = df[df.device_id == device_id]
            df_list = df2.sort('timestamp', ascending=True).collect()

            result_list.extend(create_diff_timestamp_list(device_id, df_list))

        df = spark.createDataFrame(result_list)
        dyf = DynamicFrame.fromDF(df, glueContext, 'dyf')

        # Hive format
        output_path = os.path.join(args['target_bucket'],
                                   'year={0}'.format(yesterday.year),
                                   'month={0:02}'.format(yesterday.month),
                                   'day={0:02}'.format(yesterday.day))

        glueContext.write_dynamic_frame.from_options(
            frame=dyf,
            connection_type='s3',
            connection_options={'path': output_path},
            format='parquet',
            transformation_ctx='datasink2')

    job.commit()
Example #8
def _get_spark_session_and_glue_job(glue_args):
    from awsglue.context import GlueContext
    from awsglue.job import Job

    sc = SparkContext.getOrCreate()
    glue_context = GlueContext(sparkContext=sc)
    job = Job(glue_context=glue_context)
    job.init(glue_args["JOB_NAME"], glue_args)
    return glue_context.spark_session, job
def create_glue_context():
    """ Creates Glue Context / Spark session """
    args = getResolvedOptions(sys.argv, ['JOB_NAME'])
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    job = Job(glueContext)
    job.init(args['JOB_NAME'], args)

    return spark, job
Example #10
def main():
    console("Starting Job")

    ## @params: [job_name]
    # 1. Start Glue Context
    glueContext = GlueContext(SparkContext.getOrCreate())

    # 2. Initialize Job
    job = Job(glueContext)
    args = get_args()
    job.init(args["JOB_NAME"], args)

    client = boto3.client("glue", region_name="us-east-1")

    Tables = client.get_tables(DatabaseName=args["athena_database"])
    tableList = Tables["TableList"]

    # 3. Clear bucket contents
    console(f"Excluding S3 files for: {args['target_bucket']}")
    s3 = boto3.resource("s3")
    bucket = s3.Bucket(args["target_bucket"])
    bucket.objects.delete()

    for table in tableList:
        tableName = table["Name"]
        if re.search(r"[0-9]+", tableName):
            # Ignoring Athena Tables already processed...
            continue
        else:
            console("Processing Table {}".format(tableName))

        # 4. Load dynamic dataframe
        datasource0 = glueContext.create_dynamic_frame.from_catalog(
            database=args["athena_database"],
            table_name=tableName,
            transformation_ctx="datasource0",
        )
        # 5. Process Dynamo database
        # Drop null fields
        dropnullfields1 = DropNullFields.apply(
            frame=datasource0, transformation_ctx="dropnullfields1")

        # Save Dynamic Frame on S3 using Glue
        glueContext.write_dynamic_frame.from_options(
            frame=dropnullfields1,
            connection_type="s3",
            connection_options={
                "path": "s3://{}/{}/".format(args["target_bucket"], tableName)
            },
            format="parquet",
            transformation_ctx="datasink2",
        )
    job.commit()
Example #11
def main():
    args = getResolvedOptions(sys.argv, [
        'JOB_NAME', 'DL_BUCKET', 'DL_PREFIX', 'DL_REGION', 'GLUE_SRC_DATABASE',
        'ANONYMIZATION_CONF'
    ])
    sc = SparkContext()

    os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"

    # avoiding spark creates $folders$ in S3
    hadoop_conf = sc._jsc.hadoopConfiguration()
    # hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # hadoop_conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session

    # Needed to reliably work between parquet, spark, and pandas dataframes.
    spark.conf.set("spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT", "1")
    spark.conf.set("spark.yarn.appMasterEnv.ARROW_PRE_0_15_IPC_FORMAT", "1")

    job = Job(glueContext)
    job.init(args['JOB_NAME'], args)

    input_conf_str_json = args["ANONYMIZATION_CONF"]

    print('input_conf_str_json:', input_conf_str_json)
    anonymization_conf_dict = json.loads(input_conf_str_json)
    print('anonymization_conf_dict:', anonymization_conf_dict)
    position_references_in_conf = {}
    for i in range(len(anonymization_conf_dict['datasets'])):
        position_references_in_conf[anonymization_conf_dict['datasets'][i]
                                    ['table']] = i
    print('position_references_in_conf:', position_references_in_conf)

    dataLakeBucket = args["DL_BUCKET"]
    dataLakePrefix = args["DL_PREFIX"]
    aws_region = args["DL_REGION"]

    glue_database = args["GLUE_SRC_DATABASE"]
    process(job, dataLakeBucket, dataLakePrefix, anonymization_conf_dict,
            position_references_in_conf, glueContext, glue_database,
            aws_region)
    delete_temporary_s3_objects(dataLakeBucket, aws_region)
Example #12
def run():
    """
    Run method.

    This method is the starting point of the actual logic of the Glue job
    """
    args = getResolvedOptions(sys.argv, ['JOB_NAME', 's3_bucket', 'job_id'])
    bucket = args['s3_bucket']
    job_id = args['job_id']

    glue_context = GlueContext(SparkContext.getOrCreate())
    spark = glue_context.spark_session
    job = Job(glue_context)
    job.init(args['JOB_NAME'], args)

    aggregate_key = 's3://' + bucket + '/output/' + job_id + '/rnorm-aggregate.json'
    aggregate_dataframe = spark.read.json(aggregate_key)

    basics_key = 's3://' + bucket + '/input/title.basics.tsv.gz'
    basics_dataframe = spark.read.format("csv").option("delimiter", "\t").option("header", "true").load(basics_key)
    basics_dataframe = basics_dataframe.toDF("tconstTmp", "titleType", "primaryTitle", "originalTitle", "isAdult",
                                             "startYear", "endYear", "runtimeMinutes", "genres")
    aggregate_dataframe = aggregate_dataframe.join(basics_dataframe,
                                                   aggregate_dataframe.tconst == basics_dataframe.tconstTmp,
                                                   how='left').drop('tconstTmp')

    ratings_key = 's3://' + bucket + '/input/title.ratings.tsv.gz'
    ratings_dataframe = spark.read.format("csv").option("delimiter", "\t").option("header", "true").load(ratings_key)
    ratings_dataframe = ratings_dataframe.toDF("tconstTmp", "averageRating", "numVotes")
    aggregate_dataframe = aggregate_dataframe.join(ratings_dataframe,
                                                   aggregate_dataframe.tconst == ratings_dataframe.tconstTmp,
                                                   how='left').drop('tconstTmp')

    aggregate_dataframe = merge_calc_config(spark, bucket, job_id, aggregate_dataframe, 'ynorm')
    aggregate_dataframe = merge_calc_config(spark, bucket, job_id, aggregate_dataframe, 'yp90')
    aggregate_dataframe = merge_calc_config(spark, bucket, job_id, aggregate_dataframe, 'yp10')

    results_folder = 's3://' + bucket + '/athena/job_id=' + job_id + '/'
    aggregate_dataframe.repartition(10).write.parquet(results_folder, mode="overwrite")

    job.commit()
Example #13
def main():
    ## @params: [JOB_NAME]
    args = getResolvedOptions(sys.argv, [
        'JOB_NAME', 'DL_BUCKET', 'DL_PREFIX', 'DL_REGION', 'GLUE_SRC_DATABASE'
    ])

    sc = SparkContext()
    # avoiding spark creates $folders$ in S3
    hadoop_conf = sc._jsc.hadoopConfiguration()
    hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    hadoop_conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs",
                    "false")
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    job = Job(glueContext)
    job.init(args['JOB_NAME'], args)

    dataLakeBucket = args["DL_BUCKET"]
    dataLakePrefix = args["DL_PREFIX"]
    aws_region = args["DL_REGION"]
    glue_database = args["GLUE_SRC_DATABASE"]
    process(job, glueContext, glue_database, dataLakeBucket, dataLakePrefix,
            aws_region)
import sys
import time
import re
import boto3
import pg8000
from decimal import *
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
import extract_rs_query_logs_functions as functions


## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME','REGION','CLUSTER_ENDPOINT'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
job_configs={}
job_configs.update(args)
clusterId = re.search(r'jdbc:redshift://(.+?)\..*', args['CLUSTER_ENDPOINT']).group(1)
job_configs.update(functions.getJobConfigurations(clusterId,job_configs))
job_configs['CLUSTER_ID']=clusterId
tempDir=args['TempDir']
s3Prefix=job_configs['s3_prefix']
credentials=boto3.Session().get_credentials()
job_configs['aws_access_key_id'] = credentials.access_key
job_configs['aws_secret_access_key'] = credentials.secret_key
job_configs['aws_session_token'] = credentials.token
job_configs.update(args)
job_configs['jdbcURL']="{}?user={}&password={}".format(args['CLUSTER_ENDPOINT'],job_configs['user'],job_configs['password'])
job_configs['region_name']=boto3.session.Session().region_name
job_configs['spark_session']=spark
Example #15
def main():
    ## @params: [JOB_NAME, db_name, entity_name, partition_column, output_bucket_name, datetime_column,date_column]
    args = getResolvedOptions(sys.argv, [
        'JOB_NAME', 'raw_db_name', 'clean_db_name', 'source_entity_name',
        'target_entity_name', 'partition_column', 'output_bucket_name',
        'primary_key', 'parallelism', 'date_column', 'datetime_column'
    ])
    job_name = args['JOB_NAME']
    raw_db_name = args['raw_db_name']
    clean_db_name = args['clean_db_name']
    source_entity_name = args['source_entity_name']
    target_entity_name = args['target_entity_name']
    partition_column = args['partition_column']
    date_column = args['date_column']
    datetime_column = args['datetime_column']
    hudi_primary_key = args['primary_key']
    output_bucket_name = args['output_bucket_name']
    parallelism = args['parallelism']

    # Constants derived from parameters
    raw_table_name = source_entity_name
    clean_table_name = target_entity_name

    processing_start_datetime = datetime.now(timezone.utc)

    # Initialization of contexts and job
    spark = SparkSession.builder.config(
        'spark.serializer',
        'org.apache.spark.serializer.KryoSerializer').getOrCreate()
    glue_context = GlueContext(SparkContext.getOrCreate())
    job = Job(glue_context)
    job.init(job_name, args)
    logger = glue_context.get_logger()
    logger.info('Initialization.')

    # Initialization of Glue client to connect to Glue Catalog and retrieve table information
    glueClient = boto3.client('glue')

    ## @type: DataSource
    ## @args: [database = "<db_name>", table_name = "raw_<entity_name>", transformation_ctx = "raw_data"]
    ## @return: raw_data
    ## @inputs: []
    raw_data: DynamicFrame = glue_context.create_dynamic_frame.from_catalog(
        database=raw_db_name,
        table_name=raw_table_name,
        transformation_ctx="raw_data")

    # Terminate early if there is no data to process
    if raw_data.toDF().head() is None:
        job.commit()
        return

    ## @type: CleanDataset
    ## @args: []
    ## @return: cleaned_data
    ## @inputs: [frame = raw_data]
    input_data = raw_data.toDF()
    cleaned_data = input_data.select(*[
        from_unixtime(c).alias(c) if c == 'processing_datetime' else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_timestamp(c).alias(c) if c.endswith('_datetime') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_date(c).alias(c) if c.endswith('_date') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        col(c).cast('string').alias(c) if c == 'zip' else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        col(c).cast('decimal(15,2)').alias(c) if dict(input_data.dtypes)[c] ==
        'double' else col(c) for c in input_data.columns
    ])

    ## @type: EnrichDataset
    ## @args: []
    ## @return: enriched_data
    ## @inputs: [frame = cleaned_data]
    enriched_data = cleaned_data.withColumn('etl_processing_datetime', unix_timestamp(f.lit(processing_start_datetime), 'yyyy-MM-dd HH:mm:ss').cast("timestamp")) \
        .withColumn(date_column, f.date_format(f.col(datetime_column), "yyyy-MM-dd").cast("date"))

    isTableExists = False
    try:
        glueClient.get_table(DatabaseName=clean_db_name,
                             Name=target_entity_name)
        isTableExists = True
        logger.info(clean_db_name + '.' + target_entity_name + ' exists.')
    except ClientError as e:
        if e.response['Error']['Code'] == 'EntityNotFoundException':
            isTableExists = False
            logger.info(clean_db_name + '.' + target_entity_name +
                        ' does not exist. Table will be created.')

    partition_path = '' if partition_column == 'None' else partition_column

    common_config = {
        'className': 'org.apache.hudi',
        'hoodie.datasource.hive_sync.use_jdbc': 'false',
        'hoodie.index.type': 'GLOBAL_BLOOM',
        'hoodie.datasource.write.precombine.field': datetime_column,
        'hoodie.datasource.write.recordkey.field': hudi_primary_key,
        'hoodie.table.name': target_entity_name,
        'hoodie.consistency.check.enabled': 'true',
        'hoodie.datasource.hive_sync.database': clean_db_name,
        'hoodie.datasource.hive_sync.table': target_entity_name,
        'hoodie.datasource.hive_sync.enable': 'true',
        'hoodie.datasource.write.partitionpath.field': partition_path,
        'hoodie.datasource.hive_sync.partition_fields': partition_path,
        'hoodie.datasource.hive_sync.partition_extractor_class':
            'org.apache.hudi.hive.NonPartitionedExtractor' if partition_column == 'None'
            else 'org.apache.hudi.MultiPartKeysValueExtractor',
        'hoodie.datasource.write.hive_style_partitioning':
            'false' if partition_column == 'None' else 'true',
        'hoodie.datasource.write.keygenerator.class':
            'org.apache.hudi.keygen.NonpartitionedKeyGenerator' if partition_column == 'None'
            else 'org.apache.hudi.keygen.SimpleKeyGenerator'
    }

    incremental_config = {
        'hoodie.upsert.shuffle.parallelism': parallelism,
        'hoodie.datasource.write.operation': 'upsert',
        'hoodie.cleaner.policy': 'KEEP_LATEST_COMMITS',
        'hoodie.cleaner.commits.retained': 10
    }

    initLoad_config = {
        'hoodie.bulkinsert.shuffle.parallelism': parallelism,
        'hoodie.datasource.write.operation': 'upsert'
    }

    if (isTableExists):
        logger.info('Incremental upsert.')
        combinedConf = {**common_config, **incremental_config}
        enriched_data.write.format('org.apache.hudi').options(
            **combinedConf).mode('Append').save("s3://" + output_bucket_name +
                                                "/" + clean_table_name)
    else:
        logger.info('Initial load.')
        combinedConf = {**common_config, **initLoad_config}
        enriched_data.write.format('org.apache.hudi').options(
            **combinedConf).mode('Overwrite').save("s3://" +
                                                   output_bucket_name + "/" +
                                                   clean_table_name)

    job.commit()
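
The job above expects the arguments resolved at the top of main(); a hedged sketch of starting it with boto3 (the job name and argument values are placeholders):

import boto3

glue = boto3.client("glue")
glue.start_job_run(
    JobName="hudi-clean-upsert-job",        # placeholder job name
    Arguments={
        "--raw_db_name": "raw_db",
        "--clean_db_name": "clean_db",
        "--source_entity_name": "orders",
        "--target_entity_name": "orders",
        "--partition_column": "None",       # 'None' selects the non-partitioned Hudi key generator
        "--output_bucket_name": "my-clean-bucket",
        "--primary_key": "order_id",
        "--parallelism": "8",
        "--date_column": "order_date",
        "--datetime_column": "order_datetime",
    },
)
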
Example #16
from awsglue.context import GlueContext
from pyspark.context import SparkContext
from awsglue.job import Job
import sys
from awsglue.utils import getResolvedOptions
from awsglue.dynamicframe import DynamicFrame

glueContext = GlueContext(SparkContext.getOrCreate())
glueJob = Job(glueContext)
args = getResolvedOptions(sys.argv,['JOB_NAME'])

glueJob.init(args['JOB_NAME'],args)
#sparkSession = glueContext.sparkSession
spark = glueContext.spark_session

#df = sparkSession.read.csv("s3a://pkm")
#df.show()

#dfnew = spark.read.option("header","true").option("delimiter", ",").csv("s3a://pkm")
df = spark.read.option("header", "true").format("csv").load("s3a://pkm")
#inputGDF = glueContext.create_dynamic_frame_from_options(connection_type = "s3", connection_options = {"paths": ["s3://pkm"], "recurse":True}, format = "csv")
#df=inputGDF.toDF()
df.show(2)

dynamic_dframe = DynamicFrame.fromDF(df, glueContext, "dynamic_df")
 
##Write the DynamicFrame as a file in CSV format to a folder in an S3 bucket.
##It is possible to write to any Amazon data store (SQL Server, Redshift, etc) by using any previously defined connections.
retDatasink4 = glueContext.write_dynamic_frame.from_options(frame = dynamic_dframe, connection_type = "s3", connection_options = {"path": "s3://pkm-target"}, format = "csv", transformation_ctx = "datasink4")

glueJob.commit()
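
The comment above notes that the same DynamicFrame can be written to other data stores through predefined Glue connections; a hedged sketch using a catalog connection (the connection, table, and temp-dir names are placeholders):

jdbc_sink = glueContext.write_dynamic_frame.from_jdbc_conf(
    frame=dynamic_dframe,
    catalog_connection="my-redshift-connection",
    connection_options={"dbtable": "public.pkm_target", "database": "dev"},
    redshift_tmp_dir="s3://pkm-target/temp/",
    transformation_ctx="jdbc_sink",
)
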
args = getResolvedOptions(sys.argv, ['BucketName'])

s3 = boto3.resource('s3')
s3_client = boto3.client('s3')
athena_client = boto3.client(service_name='athena', region_name='us-west-2')
bucket_name = args['BucketName']
BucketName = args['BucketName']

database = 'c360view_stage'

sc = SparkContext()

glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init('n1_c360_dispositions')


def run_query(client, query):
    response = client.start_query_execution(
        QueryString=query,
        QueryExecutionContext={'Database': database},
        ResultConfiguration={
            'OutputLocation': 's3://{}/athenaoutput/'.format(bucket_name)
        },
    )
    return response


def validate_query(client, query_id):
    resp = ["FAILED", "SUCCEEDED", "CANCELLED"]
Example #18
from pyspark.sql.functions import *
from pyspark.context import SparkContext
from awsglue.transforms import *
from awsglue.context import GlueContext
from awsglue.job import Job

glue_context = GlueContext(SparkContext.getOrCreate())
spark = glue_context.spark_session
job = Job(glue_context)
job.init('ddb2s3')

dyf = glue_context.create_dynamic_frame.from_options(
    connection_type='dynamodb',
    connection_options={
        'dynamodb.input.tableName': 'NatureRemo',
        'dynamodb.splits': '72'  # Should be (DPU-1)*8
    })

dyf.printSchema()
dyf = dyf.resolveChoice(specs=[('value', 'cast:double')])
dyf = dyf.repartition(
    1)  # You need to modify this number to your preferred file number.

glue_context.write_dynamic_frame_from_options(
    frame=dyf,
    connection_type='s3',
    connection_options={'path': "s3://example-bucket/glue/ddb2s3/"},
    format='parquet')

# You will need to specify partitionKeys if you want to write data with hive partitioning based on columns 'year', 'month', and 'day'.
#glue_context.write_dynamic_frame_from_options(
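
# A sketch of the partitioned variant hinted at above, assuming the frame
# carries 'year', 'month' and 'day' columns (not shown in this excerpt):
glue_context.write_dynamic_frame_from_options(
    frame=dyf,
    connection_type='s3',
    connection_options={
        'path': 's3://example-bucket/glue/ddb2s3/',
        'partitionKeys': ['year', 'month', 'day'],
    },
    format='parquet')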

# get the arguments
kwargs = getResolvedOptions(sys.argv, [
    'JOB_NAME', 'instruments', 'columns', 'test_weeks', 'n_tests',
    'holdout_weeks', 'length', 's3_bucket', 's3_path', 'user', 'labid'
])

kwargs['instruments'] = kwargs['instruments'].split(',')
kwargs['columns'] = kwargs['columns'].split(',')
kwargs['test_weeks'] = int(kwargs['test_weeks'])
kwargs['n_tests'] = int(kwargs['n_tests'])
kwargs['holdout_weeks'] = int(kwargs['holdout_weeks'])
kwargs['length'] = int(kwargs['length'])

# register the udf
spark.udf.register(
    "create_tests", create_tests,
    ArrayType(
        StructType([
            StructField('ts_start', TimestampType(), True),
            StructField('target', ArrayType(DoubleType()), True)
        ])))

# register the udf
spark.udf.register("to_delta", to_delta, ArrayType(DoubleType()))

job = Job(glueContext)
job.init(kwargs['JOB_NAME'], kwargs)
create_all_features(**kwargs)
job.commit()
import datetime
import boto3

args = getResolvedOptions(sys.argv, [
    'JOB_NAME', 'region', 'source_database', 'target_location', 'target_prefix'
])
job_name = args['JOB_NAME']
region = args['region']

source_db = args['source_database']
target_bucket = args['target_location']
target_prefix = args['target_prefix']
sc = SparkContext()
glueContext = GlueContext(sc)
job = Job(glueContext)
job.init(job_name, args)
client = boto3.client('glue', region_name=region)

# Make this a global variable so it can't change if we run near midnight UTC
# Plus I'm not creating a new object for every record!
now = datetime.datetime.now()


def AddPartitions(rec):
    rec["yyyy"] = now.year
    rec["mm"] = now.month
    rec["day"] = now.day
    return rec
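
# A sketch of how AddPartitions would typically be applied, assuming a
# DynamicFrame `dyf` read from the source database (hypothetical name,
# not part of this excerpt):
from awsglue.transforms import Map

partitioned_dyf = Map.apply(frame=dyf, f=AddPartitions)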


def transform(source_db,
              source_creation_dtm, pohne_nbr,
              full_nam, last_nam, fe_nam,
              fe_id, fe_idn, source_name,
              posting_agent, data_origin,
              load_dtm, update_dtm, versn_flg,
              hr_sso, hr_nam, mgr_sso,
              mgr_nam, prt_persn_d_md5, prt_persn_d_md5_scd1,
              start_dtm, end_dtm, current_flag_ind,
              deleted_flag_ind):
    """..."""

# Initialize the GlueContext and SparkContext
SC = SparkContext()
GLUECONTEXT = GlueContext(SC)
SPARK = GLUECONTEXT.spark_session
JOB = Job(GLUECONTEXT)
JOB.init(ARGS['JOB_NAME'], ARGS)
RUN_ID = ARGS['JOB_RUN_ID']
JOB_NAME = ARGS['JOB_NAME']
TEMPDIR = ARGS['TempDir']

#CurrentTimeStamp
DATETIMESTAMP = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
END_DTM = '9999-12-31 23:59:59'
JOB_START_TIME = timeit.default_timer()

# creating DataFrames from source glue crawlers

#SRC_NOTEMPTY = True


PRT_PERSN_D_SOURCE_1 = GLUECONTEXT.create_dynamic_frame. \
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
##Use Pandas library to execute Pivot function, popular Python libraries have already been integrated by default. Please check https://docs.amazonaws.cn/en_us/glue/latest/dg/aws-glue-programming-python-libraries.html
import pandas as pd
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job

args = getResolvedOptions(sys.argv, ['Target'])
glueContext = GlueContext(SparkContext.getOrCreate())
sc = glueContext.spark_session

job = Job(glueContext)
job.init(args['Target'], args)

#Read the table from preset database in Glue data source, change the value of database and table_name accordingly
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database="default",
    table_name="jsonpivottest_json",
    transformation_ctx="datasource0")
datasource0.printSchema()

applymapping1 = ApplyMapping.apply(
    frame=datasource0,
    mappings=[("dimensions", "array", "dimensions", "array"),
              ("measurename", "string", "measurename", "string"),
              ("measurevalue", "string", "measurevalue", "string"),
              ("time", "string", "time", "string"),
              ("timeunit", "string", "timeunit", "string")],
Example #23
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
#args = getResolvedOptions(sys.argv, ['JOB_NAME','VAL1','VAL2','VAL3','DEST_FOLDER'])
#v_list=[{"VAL1":args['rohit_name']}]
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'])

freshers_data = spark.read.format("csv").option("header", "true").option(
    "inferSchema", "true").load('s3://bucketname-report.csv.gz')
freshers_data.printSchema()
freshers_data.show(20, False)
#print('v_list')

#print(v_list)
Example #24
class JobRunner(object):
    """This class manages the orchestration of new jobs, including parsing arguments and control flow"""
    SERVICE_DEFINITIONS = {
        'alb': [ALBRawCatalog, ALBConvertedCatalog],
        'elb': [ELBRawCatalog, ELBConvertedCatalog],
        'cloudtrail': [CloudTrailRawCatalog, CloudTrailConvertedCatalog],
        'cloudfront': [CloudFrontRawCatalog, CloudFrontConvertedCatalog],
        's3_access': [S3AccessRawCatalog, S3AccessConvertedCatalog],
        'vpc_flow': [VPCFlowRawCatalog, VPCFlowConvertedCatalog],
    }

    def __init__(self, service_name):
        args = getResolvedOptions(sys.argv, ['JOB_NAME'] + self._job_arguments())

        # Validate the service name
        if not self.is_valid_service(service_name):
            raise Exception("'%s' is not yet a supported service." % service_name)

        self.glue_context = self._init_glue_context()
        self.job = Job(self.glue_context)
        region = self.get_instance_region()

        # Create data catalog references
        raw_klas = self.SERVICE_DEFINITIONS[service_name][0]
        converted_klas = self.SERVICE_DEFINITIONS[service_name][1]

        self.raw_catalog = raw_klas(
            region,
            args['raw_database_name'],
            args['raw_table_name'],
            args['s3_source_location']
        )
        self.optimized_catalog = converted_klas(
            region,
            args['converted_database_name'],
            args['converted_table_name'],
            args['s3_converted_target']
        )

        # Assume that if the raw table does not exist, this is our first run
        self.initial_run = not self.raw_catalog.does_table_exist()

        # Create a converter object and initialize the glue job!
        self.converter = DataConverter(self.glue_context, self.raw_catalog, self.optimized_catalog)
        self.job.init(args['JOB_NAME'], args)

    @staticmethod
    def is_valid_service(service_name):
        """Determines whether the given service_name is a supported service or not"""
        return service_name in JobRunner.SERVICE_DEFINITIONS

    def get_instance_region(self):
        """Retrieve the current AWS Region from the Instance Metadata"""
        contents = urllib2.urlopen("http://169.254.169.254/latest/dynamic/instance-identity/document").read()
        return json.loads(contents).get('region')

    def create_tables_if_needed(self):
        """If this is the initial run of the Job, create both the raw and optmized tables in the Data Catalog"""
        if self.initial_run is True:
            # TODO: Fail if the table already exists, or for converted tables if the S3 path already exists
            LOGGER.info("Initial run, scanning S3 for partitions.")
            self.raw_catalog.initialize_table_from_s3()
            # Note that if the source table is partitionless, this is a null-op.
            self.optimized_catalog.initialize_with_partitions(self.raw_catalog.partitioner.build_partitions_from_s3())

    def add_new_raw_partitions(self):
        """For the raw catalog, check and see if any new partitions exist for UTC today.

        Continue this check for every day previous until we reach a day where a partition exists."""
        if self.initial_run is not True:
            LOGGER.info("Recurring run, only looking for recent partitions on raw catalog.")
            self.raw_catalog.add_recent_partitions()

    def add_new_optimized_partitions(self):
        """For the optimized catalog, check and see if any new partitions exist for UTC today.
        Continue this check for every day previous until we reach a day where a partition exists.
        
        If this is the initial run, add whatever partitions we can find.
        """
        if self.initial_run and isinstance(self.raw_catalog.partitioner, NullPartitioner):
            LOGGER.info("Initial run with source NullPartitioner, adding all partitions from S3.")
            self.optimized_catalog.get_and_create_partitions()
        else:
            self.optimized_catalog.add_recent_partitions()

    def trigger_conversion(self):
        """Trigger the DataConverter"""
        self.converter.run()

    def finish(self):
        """Take any actions necessary to finish the job"""
        self.job.commit()

    def convert_and_partition(self):
        """A wrapper for the most common operations of these jobs. This allows for a simple one-line
        interface to the consumer, but allows them to use more-specific methods if need be.
        """
        self.create_tables_if_needed()
        self.add_new_raw_partitions()
        self.trigger_conversion()
        self.add_new_optimized_partitions()
        self.finish()

    @staticmethod
    def _job_arguments():
        return [
            'raw_database_name',
            'raw_table_name',
            'converted_database_name',
            'converted_table_name',
            's3_source_location',
            's3_converted_target'
        ]

    @staticmethod
    def _init_glue_context():
        # Imports are done here so we can isolate the configuration of this job
        from awsglue.context import GlueContext
        from pyspark.context import SparkContext
        spark_context = SparkContext.getOrCreate()
        spark_context._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")  # noqa pylint: disable=protected-access
        spark_context._jsc.hadoopConfiguration().set("parquet.enable.summary-metadata", "false")  # noqa pylint: disable=protected-access
        return GlueContext(spark_context)
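
A consuming service script would typically instantiate the runner and call the one-line wrapper described in convert_and_partition(); 'alb' is one of the keys in SERVICE_DEFINITIONS (a minimal sketch, not part of the original class):

if __name__ == "__main__":
    runner = JobRunner("alb")
    runner.convert_and_partition()
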
Example #25
def main(args):
    if args.verbose:
        print("Got arguments: %s" % (args))

    glue_client = boto3.client("glue", region_name=args.region)
    s3_client = boto3.client("s3", region_name=args.region)

    # Verify source DB and table exist
    if validate_db(args.athenaDatabase,
                   glue_client=glue_client) and args.verbose:
        print("Validated source database %s exists." % (args.athenaDatabase))
    if (validate_table(
            args.athenaDatabase, args.athenaTable, glue_client=glue_client)
            and args.verbose):
        print("Validated source table %s exists." % (args.athenaTable))

    # Verify input and output buckets exist and are accessible.
    for bucket in [args.outputBucket, args.inputBucket]:
        if validate_bucket(bucket, s3_client=s3_client) and args.verbose:
            print("Verified bucket s3://%s exists and is accessible." %
                  (args.outputBucket))

    # Use latest file in bucket that matches prefix string as input.
    input_csv, latest_dt = get_latest_file(args.inputBucket,
                                           prefix=args.inputPrefix,
                                           s3_client=s3_client)

    if not input_csv:
        raise Exception(
            "Found no candidate CSV files in bucket %s with prefix %s." % (
                args.inputBucket,
                "(no prefix)" if not args.inputPrefix else args.inputPrefix,
            ))

    print("Got latest CSV file s3://%s/%s with write time %s." %
          (args.inputBucket, input_csv, latest_dt))

    sc = SparkContext()
    gc = GlueContext(sc)
    sparkSession = gc.spark_session
    job = Job(gc)
    job.init(args.JOB_NAME, vars(args))
    sparkSession.udf.register("json_clean", json_clean)

    # For requester payer
    sparkSession._jsc.hadoopConfiguration().set("fs.s3.useRequesterPaysHeader",
                                                "true")
    gc._jsc.hadoopConfiguration().set("fs.s3.useRequesterPaysHeader", "true")

    # Get minimum hour for existing data
    prelim_min_hour = 0
    aligner = DAILY_ALIGNER if args.fullDays else HOURLY_ALIGNER
    # * 1000000 needed since start_time is in microseconds
    prelim_min_hour = ((
        (START_TIME -
         (3600 * args.maxHoursAgo)) // aligner) * aligner) * 1000000

    # Get CSV file contents, filtering out lines that don't fall within --maxHoursAgo.
    input_df = (gc.create_dynamic_frame_from_options(
        "s3",
        {
            "paths": ["s3://%s/%s" % (args.inputBucket, input_csv)]
        },
        "csv",
        {
            "withHeader": True
        },
    ).toDF().select("*"))
    print("Input DF count: %s" % (input_df.count()))
    csv_prelim_df = input_df.filter("cast(start_time as bigint) >= %s" %
                                    (prelim_min_hour))
    print("Date-bounded DF count: %s" % (csv_prelim_df.count()))

    # If --hourly is not set, query for policy hits for current hour
    # and previous 24 hours.
    # When --hourly is set, query for policy hits only for the current
    # and previous hour. (Querying for the previous hour is necessary)
    # as the previous CSV generation job may have run in the middle of the
    # hour, meaning new events for that hour may exist.)
    # (now - (now % 3600)) = start of the current hour
    # ((now - (now % 3600)) - 3600) = start of the previous hour
    hours_ago = 1 if args.hourly else 24
    increm_min_hour_sec = (START_TIME -
                           (START_TIME % 3600)) - (3600 * hours_ago)
    increm_min_hour_micro = increm_min_hour_sec * 1000000

    # When nonStrict is set, query for the latest hourly epoch in the input file
    # as well as all subsequent hourly epochs. This query may be more expensive
    # to complete, but allows for the possibility of filling in gaps if previous
    # runs of the job failed or were paused.
    if args.nonStrict:
        # We may need to re-query for transactions in the latest epoch depending on
        # when the previous query was run. Filter these rows out as well, then
        # set newest_epoch to our minimum hour epoch.
        last_seen_epoch = (
            input_df.selectExpr("cast(start_time as bigint) as ts").agg({
                "start_time":
                "max"
            }).collect().pop()[0])
        last_seen_hour = ((last_seen_epoch // 1000000) // aligner) * aligner
        # Make sure we don't go outside our --maxHoursAgo range.
        # This check is necessary as increm_min_hour_sec is used in the pushdown
        # predicate when querying against the Athena source table/view.
        increm_min_hour_sec = max(last_seen_hour, increm_min_hour_sec)
        increm_min_hour_micro = increm_min_hour_sec * 1000000

    print("Set minimum hour in seconds: %s" % (increm_min_hour_sec))

    csv_hits_in_range = csv_prelim_df.filter(
        "cast(start_time as bigint) < %s" % (increm_min_hour_micro))
    print("Total CSV hits in timeframe of interest: %s" %
          (csv_hits_in_range.count()))

    # Set timeframe for pushdown predicate
    pushdown = "(hour >= %s)" % (increm_min_hour_sec)

    # Get table contents starting from minimum hour settings.
    raw_data = gc.create_dynamic_frame.from_catalog(
        database=args.athenaDatabase,
        table_name=args.athenaTable,
        transformation_ctx="raw_data",
        push_down_predicate=pushdown,
    )

    # Determine what fields we should extract based on table definition.
    # Deriving the fields from the table schema is preferable to inferring
    # it from the underlying parquet because the two may differ.
    tmp_schema = get_table_schema(args.athenaDatabase,
                                  args.athenaTable,
                                  glue_client=glue_client)
    # Verify we only have one matching column on which to explode.
    if not [col[0] for col in tmp_schema].count(args.policyType) == 1:
        raise Exception(
            "Wrong number of matching %s columns found in schema: %s" %
            (args.policyType, tmp_schema))
    # Sub out the policies/parent_policies field for an explode expression.
    # Lightly modify other fields to promote clean and consistent writes to CSV file.
    schema = []
    for col in tmp_schema:
        if col[0] == args.policyType:
            schema.append("explode(%s) as policy" % (col[0]))
        elif "map" in col[1].lower() or "struct" in col[1].lower():
            schema.append("json_clean(%s) as %s" % (col[0], col[0]))
        else:
            schema.append("cast(%s as string)" % (col[0]))

    # Get targeted policies/parent_policies.
    # Should be passed to SparkSQL as quoted strings.
    pol_arr = ", ".join([
        "'%s'" % (pol) for pol in args.policyStrings.split(args.delimiter)
        if pol
    ])

    # Filter table contents according to policy hits.
    # A non-null policies field implies a non-null parent_policies field
    # (and vice versa), so it's OK to filter on just one.

    new_hits = (raw_data.toDF().filter("policies is not NULL").selectExpr(
        *schema).filter("policy in (%s)" % (pol_arr)))
    print("New policy triggers found since last job: %s" % (new_hits.count()))

    # Combine newly collected policy hits with dataframe of previous CSV contents.
    write_df = new_hits.union(csv_hits_in_range).orderBy(
        "start_time").coalesce(1)

    uniq = hash_key(args.salt, args.ordinal, args.subscriber, args.receiver)
    s3_loc = "s3://%s" % (os.path.join(args.outputBucket, args.outputDir,
                                       uniq))
    if args.verbose:
        print("S3 Results Location: %s" % s3_loc)

    write_df.write.option("quoteAll", True).csv(s3_loc, header=True)

    # Rename output file, if requested.
    if args.outputFilename:
        rename_resp = rename_file(
            args.outputBucket,
            args.outputFilename,
            prefix=args.outputDir,
            dont_preserve_dir=args.dontPreserveOutputDir,
            keep_orig=args.keepOrigOnRename,
        )

        if rename_resp and args.verbose:
            print("Renamed file to %s/%s." %
                  (args.outputDir, args.outputFilename))

    job.commit()
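
A small worked example of the hour-window arithmetic used above (the epoch value is illustrative; START_TIME is in seconds while the start_time values in the data are in microseconds):

START_TIME = 1700003000                                         # some moment inside an hour
hours_ago = 24
start_of_current_hour = START_TIME - (START_TIME % 3600)        # 1700002800
increm_min_hour_sec = start_of_current_hour - 3600 * hours_ago  # 24 hours earlier
increm_min_hour_micro = increm_min_hour_sec * 1000000           # microseconds, to compare against start_time
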
Example #26
    # compress string stream using gzip
    with gzip.GzipFile(mode='w', fileobj=gz_buffer) as gz_file:
        gz_file.write(bytes(csv_buffer.getvalue(), 'utf-8'))
    # write stream to S3
    client.put_object(Bucket=bucket, Key=key, Body=gz_buffer.getvalue())


job_args = getResolvedOptions(
    sys.argv,
    ['JOB_NAME', 'bucket', 'input-key', 'output-key', 'output-filename'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(job_args['JOB_NAME'], job_args)

client = boto3.client('s3')

data = spark.read.format('csv').options(
    header='true').load(f"s3://{job_args['bucket']}/{job_args['input_key']}")

# order columns for export
ordered_data = data \
  .select(data.ticker, data.market)

now = datetime.now()
date_time = now.strftime("%Y-%m-%d_%H:%M:%S")
destination = f"{job_args['output_key']}{date_time}"

# create destination folder
Example #27
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.transforms import *
from awsglue.job import Job
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from datetime import datetime

# Create a Glue context
glueContext = GlueContext(SparkContext.getOrCreate())

# Create a Glue Job
job = Job(glueContext)
job.init("infrav-demo")

# Create a dynamic frame from the catalog
infrav_DyF = glueContext.create_dynamic_frame.from_catalog(database = "infrav", table_name = "vcenter")
infrav_DyF.printSchema()

# Apply schema
applymapping1 = ApplyMapping.apply(frame = infrav_DyF, mappings = [("fields.usage_average", "double", "usage_average", "double"), ("name", "string", "name", "string"), ("tags.clustername", "string", "clustername", "string"), ("tags.cpu", "string", "cpu", "string"), ("tags.dcname", "string", "dcname", "string"), ("tags.esxhostname", "string", "esxhostname", "string"),("tags.host", "string", "host", "string"), ("tags.moid", "string", "moid", "string"), ("tags.source", "string", "source", "string"), ("tags.vcenter", "string", "vcenter", "string"), ("timestamp", "int", "timestamp", "int"), ("year", "string", "year", "string"), ("month", "string", "month", "string"), ("day", "string", "day", "string"), ("hour", "string", "hour", "string")])
#Print new Schema
applymapping1.printSchema()

# Convert DyF to DF
df=applymapping1.toDF()

# Create a function that returns the desired string from a timestamp
def format_timestamp(ts):
Example #28
import sys

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.transforms import ApplyMapping, DropNullFields, ResolveChoice
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ["JOB_NAME"])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
## @type: DataSource
## @args: [database = "dotz_challenge_raw", table_name = "comp_boss_csv", transformation_ctx = "datasource0"]
## @return: datasource0
## @inputs: []
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database="dotz_challenge_raw",
    table_name="comp_boss_csv",
    transformation_ctx="datasource0")
## @type: ApplyMapping
## @args: [mapping = [("component_id", "string", "component_id", "string"), ("component_type_id", "string", "component_type_id", "string"), ("type", "string", "type", "string"), ("connection_type_id", "string", "connection_type_id", "string"), ("outside_shape", "string", "outside_shape", "string"), ("base_type", "string", "base_type", "string"), ("height_over_tube", "double", "height_over_tube", "double"), ("bolt_pattern_long", "string", "bolt_pattern_long", "string"), ("bolt_pattern_wide", "string", "bolt_pattern_wide", "string"), ("groove", "string", "groove", "string"), ("base_diameter", "string", "base_diameter", "string"), ("shoulder_diameter", "string", "shoulder_diameter", "string"), ("unique_feature", "string", "unique_feature", "string"), ("orientation", "string", "orientation", "string"), ("weight", "string", "weight", "string")], transformation_ctx = "applymapping1"]
## @return: applymapping1
## @inputs: [frame = datasource0]
applymapping1 = ApplyMapping.apply(
    frame=datasource0,
    mappings=[
fact_table_database = 'highereducation-dw-edudirectdb-staging-dev'
fact_table_v1 = ''
fact_table_name = args['DATABASE_STAG']
database = args[
    'DATABASE_PARQUET']  # possible values = ['highereducation-dw-edudirectdb-parquet-current', 'highereducation-dw-edudirectdb-parquet']

if args['TYPE'] == 'current_day':
    fact_table_v1 = 'lead_fact_table_' + ENVIRONMENT + '_current_v1'
    fact_table_name = 'lead_fact_table_' + ENVIRONMENT + '_current'
else:
    fact_table_v1 = 'lead_fact_table_' + ENVIRONMENT + '_v1'
    fact_table_name = 'lead_fact_table_' + ENVIRONMENT

spark = glueContext.spark_session
job = Job(glueContext)
job.init(JOB_NAME, args)

# Required tables
lead = dfu.get_dyf_frame(database=database,
                         tbl='cddirect_production_lead').toDF()
school_provider_program = dfu.get_dyf_frame(
    database=database,
    tbl='cddirect_production_school_provider_program').toDF()
school_program = dfu.get_dyf_frame(
    database=database, tbl='cddirect_production_school_program').toDF()
widget_degree = dfu.get_dyf_frame(
    database=database, tbl='cddirect_production_widget_degree').toDF()
widget_subject = dfu.get_dyf_frame(
    database=database, tbl='cddirect_production_widget_subject').toDF()
widget_category = dfu.get_dyf_frame(
    database=database, tbl='cddirect_production_widget_category').toDF()
from pyspark.sql import SparkSession
from pyspark.sql.types import NullType, StructType, StructField
from pyspark.sql.types import StringType, DateType, LongType, ArrayType
from pyspark.sql.functions import *
import itertools
import boto3
import time


args = getResolvedOptions(sys.argv, ['JOB_NAME', 'bucketName' ,'workspace', 'jdbc_url', 'username', 'pswd'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

#ccn-alerts-104328-{workspace}-sfmc-migration
workspace = args['workspace']
bucketName = args['bucketName']
jdbc_url = args['jdbc_url']
username = args['username']
pswd = args['pswd']

print('args passed are: {}, {}, {}, {}'.format(workspace, bucketName, jdbc_url, username) )

s3Location = 's3://{}'.format(bucketName)
glue_database = '{}_sfmc_migration'.format(workspace)
outputLocation = ''+s3Location+'/athena-results-ccn-data/'

print("workspace: ", workspace, ", s3Location: ", s3Location, ", glue_database: ", glue_database, ", outputLocation: ", outputLocation)
def create_job(glueContext, args):
    job = Job(glueContext)
    job.init(args["JOB_NAME"], args)
    return job