class JobRunner(object):
    """This class manages the orchestration of new jobs, including parsing arguments and control flow"""
    SERVICE_DEFINITIONS = {
        'alb': [ALBRawCatalog, ALBConvertedCatalog],
        'elb': [ELBRawCatalog, ELBConvertedCatalog],
        'cloudtrail': [CloudTrailRawCatalog, CloudTrailConvertedCatalog],
        'cloudfront': [CloudFrontRawCatalog, CloudFrontConvertedCatalog],
        's3_access': [S3AccessRawCatalog, S3AccessConvertedCatalog],
        'vpc_flow': [VPCFlowRawCatalog, VPCFlowConvertedCatalog],
    }

    def __init__(self, service_name):
        args = getResolvedOptions(sys.argv, ['JOB_NAME'] + self._job_arguments())

        # Validate the service name
        if not self.is_valid_service(service_name):
            raise Exception("'%s' is not yet a supported service." % service_name)

        self.glue_context = self._init_glue_context()
        self.job = Job(self.glue_context)
        region = self.get_instance_region()

        # Create data catalog references
        raw_klas = self.SERVICE_DEFINITIONS[service_name][0]
        converted_klas = self.SERVICE_DEFINITIONS[service_name][1]

        self.raw_catalog = raw_klas(
            region,
            args['raw_database_name'],
            args['raw_table_name'],
            args['s3_source_location']
        )
        self.optimized_catalog = converted_klas(
            region,
            args['converted_database_name'],
            args['converted_table_name'],
            args['s3_converted_target']
        )

        # Assume that if the raw table does not exist, this is our first run
        self.initial_run = not self.raw_catalog.does_table_exist()

        # Create a converter object and initialize the glue job!
        self.converter = DataConverter(self.glue_context, self.raw_catalog, self.optimized_catalog)
        self.job.init(args['JOB_NAME'], args)

    @staticmethod
    def is_valid_service(service_name):
        """Determines whether the given service_name is a supported service or not"""
        return service_name in JobRunner.SERVICE_DEFINITIONS

    def get_instance_region(self):
        """Retrieve the current AWS Region from the Instance Metadata"""
        contents = urllib2.urlopen("http://169.254.169.254/latest/dynamic/instance-identity/document").read()
        return json.loads(contents).get('region')

    def create_tables_if_needed(self):
        """If this is the initial run of the Job, create both the raw and optmized tables in the Data Catalog"""
        if self.initial_run is True:
            # TODO: Fail if the table already exists, or for converted tables if the S3 path already exists
            LOGGER.info("Initial run, scanning S3 for partitions.")
            self.raw_catalog.initialize_table_from_s3()
            # Note that if the source table is partitionless, this is a no-op.
            self.optimized_catalog.initialize_with_partitions(self.raw_catalog.partitioner.build_partitions_from_s3())

    def add_new_raw_partitions(self):
        """For the raw catalog, check and see if any new partitions exist for UTC today.

        Continue this check for every day previous until we reach a day where a partition exists."""
        if not self.initial_run:
            LOGGER.info("Recurring run, only looking for recent partitions on raw catalog.")
            self.raw_catalog.add_recent_partitions()

    def add_new_optimized_partitions(self):
        """For the optimized catalog, check and see if any new partitions exist for UTC today.
        Continue this check for every day previous until we reach a day where a partition exists.
        
        If this is the initial run, add whatever partitions we can find.
        """
        if self.initial_run and isinstance(self.raw_catalog.partitioner, NullPartitioner):
            LOGGER.info("Initial run with source NullPartitioner, adding all partitions from S3.")
            self.optimized_catalog.get_and_create_partitions()
        else:
            self.optimized_catalog.add_recent_partitions()

    def trigger_conversion(self):
        """Trigger the DataConverter"""
        self.converter.run()

    def finish(self):
        """Take any actions necessary to finish the job"""
        self.job.commit()

    def convert_and_partition(self):
        """A wrapper for the most common operations of these jobs. This allows for a simple one-line
        interface to the consumer, but allows them to use more-specific methods if need be.
        """
        self.create_tables_if_needed()
        self.add_new_raw_partitions()
        self.trigger_conversion()
        self.add_new_optimized_partitions()
        self.finish()

    @staticmethod
    def _job_arguments():
        return [
            'raw_database_name',
            'raw_table_name',
            'converted_database_name',
            'converted_table_name',
            's3_source_location',
            's3_converted_target'
        ]

    @staticmethod
    def _init_glue_context():
        # Imports are done here so we can isolate the configuration of this job
        from awsglue.context import GlueContext
        from pyspark.context import SparkContext
        spark_context = SparkContext.getOrCreate()
        spark_context._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")  # noqa pylint: disable=protected-access
        spark_context._jsc.hadoopConfiguration().set("parquet.enable.summary-metadata", "false")  # noqa pylint: disable=protected-access
        return GlueContext(spark_context)
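
# A minimal usage sketch for JobRunner, assuming this class is run as a Glue job
# script with the arguments listed in _job_arguments(). The 'alb' key is
# illustrative; any key from SERVICE_DEFINITIONS works. convert_and_partition()
# wraps the full create/partition/convert flow shown above.
if __name__ == '__main__':
    job_run = JobRunner(service_name='alb')
    job_run.convert_and_partition()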
            # Combine all CSV parts, split the "Trade Name:" suffix out into its
            # own column, then write the combined file back out.
            raw_df = pd.concat([pd.read_csv(i) for i in all_filenames], join="outer")
            data = raw_df['0'].str.split(r'Trade Name:', expand=True)
            raw_df['Trade'] = data[1]
            raw_df['0'] = data[0]
            raw_df.to_csv(destination_file, index=False, encoding='utf-8-sig')
    #        raw_df = wr.s3.read_csv(path=f, path_suffix = ['.csv'], dataset=True)
    # wr.s3.delete_objects(f"s3://{dest_bucket}/{destination_file}")
    #     raw_df.to_csv(f,index=False, encoding='utf-8-sig')

    # Call the conversion functions with the latest available files
    today = datetime.today()
    date = today.strftime('%Y%m%d')
    year, month, day = today.strftime('%Y-%m-%d').split('-')
    filenames = ['growers', 'processor', 'transporter', 'dispensaries', 'laboratory', 'waste_disposal']
    for file in filenames:
        raw_path_dir = 'US/OK/CannabisLifecycle/' + file + '/' + year + '/' + month + '/' + day + '/'
        source_filename = 'US/OK/CannabisLifecycle/' + file + '_' + date + '.pdf'
        dest_filename = raw_path_dir + file + '_' + date
        combined = raw_path_dir + file + '_final_' + date + '.csv'
        pdf_to_csv(source_bucket, source_filename, dest_bucket, dest_filename)
        csvToCombinedcsv(dest_bucket, combined, dest_bucket, raw_path_dir)
        # Clean up the intermediate per-file CSVs once the combined file is written
        for j in all_filenames:
            if dest_filename in j:
                s3.rm(j)

job.commit()
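
# The loop above repeats the 'US/OK/CannabisLifecycle/<file>/<year>/<month>/<day>/'
# prefix in several places. A small helper could build it in one spot;
# build_dated_prefix is a hypothetical name, not part of the original job.
from datetime import datetime

def build_dated_prefix(category, when=None):
    """Return the date-partitioned S3 prefix used by the combine step above."""
    when = when or datetime.today()
    return 'US/OK/CannabisLifecycle/{0}/{1:%Y}/{1:%m}/{1:%d}/'.format(category, when)

# e.g. build_dated_prefix('growers') -> 'US/OK/CannabisLifecycle/growers/2024/01/31/'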
latestTimestampVal=functions.getMaxValue(returnDF,"endtime",job_configs)
functions.updateLastProcessedTSValue(clusterId+"_stl_utilitytext",latestTimestampVal[0],job_configs)

#### Alert Event Log  #####

stlAlertEventLogProcessedTSValue = functions.getLastProcessedTSValue(clusterId+"_stl_alert_event_log",job_configs)
returnDF=functions.runQuery("select '{}' as clusterId,trunc(event_time) as startDate,* from stl_alert_event_log where event_time > '{}'".format(clusterId,stlAlertEventLogProcessedTSValue),"stl_alert_event_log",job_configs)
functions.saveToS3(returnDF,s3Prefix,"stl_alert_event_log",["clusterid","startdate"],job_configs)
latestTimestampVal=functions.getMaxValue(returnDF,"event_time",job_configs)
functions.updateLastProcessedTSValue(clusterId+"_stl_alert_event_log",latestTimestampVal[0],job_configs)


#### STL_SCAN #####

stlScanLastProcessedTSValue= functions.getLastProcessedTSValue(clusterId+"_stl_scan",job_configs)
returnDF=functions.runQuery("select '{}' as clusterId,trunc(starttime) as startDate,* from stl_scan where endtime > '{}'".format(clusterId,stlScanLastProcessedTSValue),"stl_scan",job_configs)
functions.saveToS3(returnDF,s3Prefix,"stl_scan",["clusterid","startdate"],job_configs)
latestTimestampVal=functions.getMaxValue(returnDF,"endtime",job_configs)
functions.updateLastProcessedTSValue(clusterId+"_stl_scan",latestTimestampVal[0],job_configs)

#### STL_WLM_QUERY #####

stlWLMQueryLastProcessedTSValue= functions.getLastProcessedTSValue(clusterId+"_stl_wlm_query",job_configs)
returnDF=functions.runQuery("select '{}' as clusterId,trunc(queue_start_time) as startDate,* from stl_wlm_query where queue_end_time > '{}'".format(clusterId,stlWLMQueryLastProcessedTSValue),"stl_wlm_query",job_configs)
functions.saveToS3(returnDF,s3Prefix,"stl_wlm_query",["clusterid","startdate"],job_configs)
latestTimestampVal=functions.getMaxValue(returnDF,"queue_end_time",job_configs)
functions.updateLastProcessedTSValue(clusterId+"_stl_wlm_query",latestTimestampVal[0],job_configs)


job.commit()
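
# Each block above repeats the same incremental-export pattern: read the last
# processed timestamp, query rows newer than it, write them to S3, then advance
# the checkpoint. A sketch of a shared helper (export_system_table is a
# hypothetical name; the functions.* calls mirror the signatures used above):
def export_system_table(table_name, ts_column, query_template, job_configs, cluster_id, s3_prefix):
    last_ts = functions.getLastProcessedTSValue(cluster_id + "_" + table_name, job_configs)
    df = functions.runQuery(query_template.format(cluster_id, last_ts), table_name, job_configs)
    functions.saveToS3(df, s3_prefix, table_name, ["clusterid", "startdate"], job_configs)
    latest_ts = functions.getMaxValue(df, ts_column, job_configs)
    functions.updateLastProcessedTSValue(cluster_id + "_" + table_name, latest_ts[0], job_configs)

# Example call, equivalent to the STL_SCAN block above:
# export_system_table(
#     "stl_scan", "endtime",
#     "select '{}' as clusterId,trunc(starttime) as startDate,* from stl_scan where endtime > '{}'",
#     job_configs, clusterId, s3Prefix)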
                       .withColumn('Records_frm_src', F.lit(INPUT_RECORDS)) \
                       .withColumn('Datasink_Elapsed_Time',
                                   F.lit(DATASINK_ELAPSED_TIME)) \
                       .withColumn('Job_Elapsed_Time', F.lit(JOB_ELAPSED_TIME))

#Creating Temp View for COUNTS_DF2
COUNTS_DF2.createOrReplaceTempView("final_counts_dataframe")

# One DataLog DataFrame written to CloudWatch Logs
# Flat files don't have an op_val column, so counts can't be calculated
AUDITING_COUNTS_DF = SPARK.sql("""
  select
  JobName as JobName, 
  JobRunId as JobId,
  Job_Start_Time,
  Job_End_Time,
  Job_Elapsed_Time,
  --coalesce(InsertsVal, 0) as InsertVal,
  --coalesce(UpdatesVal, 0) as UpdateVal,
  --coalesce(DeletesVal, 0) as DeleteVal,
  Datasink_Elapsed_Time,
  Records_frm_src,
  StepLog as StepLog
  from 
  final_counts_dataframe
""")

AUDITING_COUNTS_DF.show(10, False)

JOB.commit()
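
# The COUNTS_DF2 chain above is shown only from its trailing .withColumn calls.
# A sketch of how such an audit dataframe might be assembled is below; it assumes
# the SPARK session and F (pyspark.sql.functions) aliases defined earlier in this
# script, and the literal row values are placeholders, not the original job's data.
AUDIT_BASE_DF = SPARK.createDataFrame(
    [('my-glue-job', 'jr_0123', '2024-01-01 00:00:00', '2024-01-01 00:05:00', 'ok')],
    ['JobName', 'JobRunId', 'Job_Start_Time', 'Job_End_Time', 'StepLog'])
AUDIT_SKETCH_DF = AUDIT_BASE_DF \
    .withColumn('Records_frm_src', F.lit(INPUT_RECORDS)) \
    .withColumn('Datasink_Elapsed_Time', F.lit(DATASINK_ELAPSED_TIME)) \
    .withColumn('Job_Elapsed_Time', F.lit(JOB_ELAPSED_TIME))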
Example #5
def main():
    ## @params: [JOB_NAME, db_name, entity_name, partition_column, output_bucket_name, datetime_column,date_column]
    args = getResolvedOptions(sys.argv, [
        'JOB_NAME', 'raw_db_name', 'clean_db_name', 'source_entity_name',
        'target_entity_name', 'partition_column', 'output_bucket_name',
        'primary_key', 'parallelism', 'date_column', 'datetime_column'
    ])
    job_name = args['JOB_NAME']
    raw_db_name = args['raw_db_name']
    clean_db_name = args['clean_db_name']
    source_entity_name = args['source_entity_name']
    target_entity_name = args['target_entity_name']
    partition_column = args['partition_column']
    date_column = args['date_column']
    datetime_column = args['datetime_column']
    hudi_primary_key = args['primary_key']
    output_bucket_name = args['output_bucket_name']
    parallelism = args['parallelism']

    # Constants derived from parameters
    raw_table_name = source_entity_name
    clean_table_name = target_entity_name

    processing_start_datetime = datetime.now(timezone.utc)

    # Initialization of contexts and job
    spark = SparkSession.builder.config(
        'spark.serializer',
        'org.apache.spark.serializer.KryoSerializer').getOrCreate()
    glue_context = GlueContext(SparkContext.getOrCreate())
    job = Job(glue_context)
    job.init(job_name, args)
    logger = glue_context.get_logger()
    logger.info('Initialization.')

    # Initialization of Glue client to connect to Glue Catalog and retrieve table information
    glueClient = boto3.client('glue')

    ## @type: DataSource
    ## @args: [database = "<db_name>", table_name = "raw_<entity_name>", transformation_ctx = "raw_data"]
    ## @return: raw_data
    ## @inputs: []
    raw_data: DynamicFrame = glue_context.create_dynamic_frame.from_catalog(
        database=raw_db_name,
        table_name=raw_table_name,
        transformation_ctx="raw_data")

    # Terminate early if there is no data to process
    if raw_data.toDF().head() is None:
        job.commit()
        return

    ## @type: CleanDataset
    ## @args: []
    ## @return: cleaned_data
    ## @inputs: [frame = raw_data]
    input_data = raw_data.toDF()
    cleaned_data = input_data.select(*[
        from_unixtime(c).alias(c) if c == 'processing_datetime' else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_timestamp(c).alias(c) if c.endswith('_datetime') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_date(c).alias(c) if c.endswith('_date') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        col(c).cast('string').alias(c) if c == 'zip' else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        col(c).cast('decimal(15,2)').alias(c) if dict(input_data.dtypes)[c] ==
        'double' else col(c) for c in input_data.columns
    ])

    ## @type: EnrichDataset
    ## @args: []
    ## @return: enriched_data
    ## @inputs: [frame = cleaned_data]
    enriched_data = cleaned_data.withColumn('etl_processing_datetime', unix_timestamp(f.lit(processing_start_datetime), 'yyyy-MM-dd HH:mm:ss').cast("timestamp")) \
        .withColumn(date_column, f.date_format(f.col(datetime_column), "yyyy-MM-dd").cast("date"))

    isTableExists = False
    try:
        glueClient.get_table(DatabaseName=clean_db_name,
                             Name=target_entity_name)
        isTableExists = True
        logger.info(clean_db_name + '.' + target_entity_name + ' exists.')
    except ClientError as e:
        if e.response['Error']['Code'] == 'EntityNotFoundException':
            isTableExists = False
            logger.info(clean_db_name + '.' + target_entity_name +
                        ' does not exist. Table will be created.')
        else:
            raise

    partition_path = '' if partition_column == 'None' else partition_column

    common_config = {
        'className': 'org.apache.hudi',
        'hoodie.datasource.hive_sync.use_jdbc': 'false',
        'hoodie.index.type': 'GLOBAL_BLOOM',
        'hoodie.datasource.write.precombine.field': datetime_column,
        'hoodie.datasource.write.recordkey.field': hudi_primary_key,
        'hoodie.table.name': target_entity_name,
        'hoodie.consistency.check.enabled': 'true',
        'hoodie.datasource.hive_sync.database': clean_db_name,
        'hoodie.datasource.hive_sync.table': target_entity_name,
        'hoodie.datasource.hive_sync.enable': 'true',
        'hoodie.datasource.write.partitionpath.field': partition_path,
        'hoodie.datasource.hive_sync.partition_fields': partition_path,
        'hoodie.datasource.hive_sync.partition_extractor_class':
            'org.apache.hudi.hive.NonPartitionedExtractor' if partition_column == 'None'
            else 'org.apache.hudi.MultiPartKeysValueExtractor',
        'hoodie.datasource.write.hive_style_partitioning':
            'false' if partition_column == 'None' else 'true',
        'hoodie.datasource.write.keygenerator.class':
            'org.apache.hudi.keygen.NonpartitionedKeyGenerator' if partition_column == 'None'
            else 'org.apache.hudi.keygen.SimpleKeyGenerator'
    }

    incremental_config = {
        'hoodie.upsert.shuffle.parallelism': parallelism,
        'hoodie.datasource.write.operation': 'upsert',
        'hoodie.cleaner.policy': 'KEEP_LATEST_COMMITS',
        'hoodie.cleaner.commits.retained': 10
    }

    initLoad_config = {
        'hoodie.bulkinsert.shuffle.parallelism': parallelism,
        'hoodie.datasource.write.operation': 'upsert'
    }

    if (isTableExists):
        logger.info('Incremental upsert.')
        combinedConf = {**common_config, **incremental_config}
        enriched_data.write.format('org.apache.hudi').options(
            **combinedConf).mode('Append').save("s3://" + output_bucket_name +
                                                "/" + clean_table_name)
    else:
        logger.info('Initial load.')
        combinedConf = {**common_config, **initLoad_config}
        enriched_data.write.format('org.apache.hudi').options(
            **combinedConf).mode('Overwrite').save("s3://" +
                                                   output_bucket_name + "/" +
                                                   clean_table_name)

    job.commit()
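
# The Glue Data Catalog existence check above can be factored into a small helper.
# table_exists is a hypothetical name; it wraps the same boto3 get_table call and
# EntityNotFoundException handling used in main().
import boto3
from botocore.exceptions import ClientError

def table_exists(glue_client, database_name, table_name):
    """Return True if the table is registered in the Glue Data Catalog."""
    try:
        glue_client.get_table(DatabaseName=database_name, Name=table_name)
        return True
    except ClientError as e:
        if e.response['Error']['Code'] == 'EntityNotFoundException':
            return False
        raise

# Usage: table_exists(boto3.client('glue'), clean_db_name, target_entity_name)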
Example #6
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import sys
from awsglue.utils import getResolvedOptions
from awsglue.dynamicframe import DynamicFrame

glueContext = GlueContext(SparkContext.getOrCreate())
glueJob = Job(glueContext)
args = getResolvedOptions(sys.argv,['JOB_NAME'])

glueJob.init(args['JOB_NAME'],args)
#sparkSession = glueContext.sparkSession
spark = glueContext.spark_session

#df = sparkSession.read.csv("s3a://pkm")
#df.show()

#dfnew = spark.read.option("header","true").option("delimiter", ",").csv("s3a://pkm")
df = spark.read.option("header", "true").format("csv").load("s3a://pkm")
#inputGDF = glueContext.create_dynamic_frame_from_options(connection_type = "s3", connection_options = {"paths": ["s3://pkm"], "recurse":True}, format = "csv")
#df=inputGDF.toDF()
df.show(2)

dynamic_dframe = DynamicFrame.fromDF(df, glueContext, "dynamic_df")
 
##Write the DynamicFrame as a file in CSV format to a folder in an S3 bucket.
##It is possible to write to any Amazon data store (SQL Server, Redshift, etc) by using any previously defined connections.
retDatasink4 = glueContext.write_dynamic_frame.from_options(frame = dynamic_dframe, connection_type = "s3", connection_options = {"path": "s3://pkm-target"}, format = "csv", transformation_ctx = "datasink4")

glueJob.commit()
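
# The write above typically produces one CSV part file per partition of df. A sketch
# of forcing a single output file by repartitioning before the DynamicFrame
# conversion; this trades parallelism for one part file, and "s3://pkm-target" is
# the same placeholder bucket used above.
single_part_df = df.repartition(1)
single_part_dyf = DynamicFrame.fromDF(single_part_df, glueContext, "single_part_dyf")
glueContext.write_dynamic_frame.from_options(
    frame=single_part_dyf,
    connection_type="s3",
    connection_options={"path": "s3://pkm-target"},
    format="csv",
    transformation_ctx="datasink_single")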
Example #7
def main(args):
    if args.verbose:
        print("Got arguments: %s" % (args))

    glue_client = boto3.client("glue", region_name=args.region)
    s3_client = boto3.client("s3", region_name=args.region)

    # Verify source DB and table exist
    if validate_db(args.athenaDatabase,
                   glue_client=glue_client) and args.verbose:
        print("Validated source database %s exists." % (args.athenaDatabase))
    if (validate_table(
            args.athenaDatabase, args.athenaTable, glue_client=glue_client)
            and args.verbose):
        print("Validated source table %s exists." % (args.athenaTable))

    # Verify input and output buckets exist and are accessible.
    for bucket in [args.outputBucket, args.inputBucket]:
        if validate_bucket(bucket, s3_client=s3_client) and args.verbose:
            print("Verified bucket s3://%s exists and is accessible." %
                  (args.outputBucket))

    # Use latest file in bucket that matches prefix string as input.
    input_csv, latest_dt = get_latest_file(args.inputBucket,
                                           prefix=args.inputPrefix,
                                           s3_client=s3_client)

    if not input_csv:
        raise Exception(
            "Found no candidate CSV files in bucket %s with prefix %s." % (
                args.inputBucket,
                "(no prefix)" if not args.inputPrefix else args.inputPrefix,
            ))

    print("Got latest CSV file s3://%s/%s with write time %s." %
          (args.inputBucket, input_csv, latest_dt))

    sc = SparkContext()
    gc = GlueContext(sc)
    sparkSession = gc.spark_session
    job = Job(gc)
    job.init(args.JOB_NAME, vars(args))
    sparkSession.udf.register("json_clean", json_clean)

    # For requester payer
    sparkSession._jsc.hadoopConfiguration().set("fs.s3.useRequesterPaysHeader",
                                                "true")
    gc._jsc.hadoopConfiguration().set("fs.s3.useRequesterPaysHeader", "true")

    # Get minimum hour for existing data
    prelim_min_hour = 0
    aligner = DAILY_ALIGNER if args.fullDays else HOURLY_ALIGNER
    # * 1000000 needed since start_time is in microseconds
    prelim_min_hour = ((
        (START_TIME -
         (3600 * args.maxHoursAgo)) // aligner) * aligner) * 1000000

    # Get CSV file contents, filtering out lines that don't fall within --maxHoursAgo.
    input_df = (gc.create_dynamic_frame_from_options(
        "s3",
        {
            "paths": ["s3://%s/%s" % (args.inputBucket, input_csv)]
        },
        "csv",
        {
            "withHeader": True
        },
    ).toDF().select("*"))
    print("Input DF count: %s" % (input_df.count()))
    csv_prelim_df = input_df.filter("cast(start_time as bigint) >= %s" %
                                    (prelim_min_hour))
    print("Date-bounded DF count: %s" % (csv_prelim_df.count()))

    # If --hourly is not set, query for policy hits for current hour
    # and previous 24 hours.
    # When --hourly is set, query for policy hits only for the current
    # and previous hour. (Querying for the previous hour is necessary
    # as the previous CSV generation job may have run in the middle of the
    # hour, meaning new events for that hour may exist.)
    # (now - (now % 3600)) = start of the current hour
    # ((now - (now % 3600)) - 3600) = start of the previous hour
    hours_ago = 1 if args.hourly else 24
    increm_min_hour_sec = (START_TIME -
                           (START_TIME % 3600)) - (3600 * hours_ago)
    increm_min_hour_micro = increm_min_hour_sec * 1000000

    # When nonStrict is set, query for the latest hourly epoch in the input file
    # as well as all subsequent hourly epochs. This query may be more expensive
    # to complete, but allows for the possibility of filling in gaps if previous
    # runs of the job failed or were paused.
    if args.nonStrict:
        # We may need to re-query for transactions in the latest epoch depending on
        # when the previous query was run. Filter these rows out as well, then
        # set newest_epoch to our minimum hour epoch.
        last_seen_epoch = (
            input_df.selectExpr("cast(start_time as bigint) as ts")
            .agg({"ts": "max"})
            .collect()
            .pop()[0])
        last_seen_hour = ((last_seen_epoch // 1000000) // aligner) * aligner
        # Make sure we don't go outside our --maxHoursAgo range.
        # This check is necessary as increm_min_hour_sec is used in the pushdown
        # predicate when querying against the Athena source table/view.
        increm_min_hour_sec = max(last_seen_hour, increm_min_hour_sec)
        increm_min_hour_micro = increm_min_hour_sec * 1000000

    print("Set minimum hour in seconds: %s" % (increm_min_hour_sec))

    csv_hits_in_range = csv_prelim_df.filter(
        "cast(start_time as bigint) < %s" % (increm_min_hour_micro))
    print("Total CSV hits in timeframe of interest: %s" %
          (csv_hits_in_range.count()))

    # Set timeframe for pushdown predicate
    pushdown = "(hour >= %s)" % (increm_min_hour_sec)

    # Get table contents starting from minimum hour settings.
    raw_data = gc.create_dynamic_frame.from_catalog(
        database=args.athenaDatabase,
        table_name=args.athenaTable,
        transformation_ctx="raw_data",
        push_down_predicate=pushdown,
    )

    # Determine what fields we should extract based on table definition.
    # Deriving the fields from the table schema is preferable to inferring
    # it from the underlying parquet because the two may differ.
    tmp_schema = get_table_schema(args.athenaDatabase,
                                  args.athenaTable,
                                  glue_client=glue_client)
    # Verify we only have one matching column on which to explode.
    if not [col[0] for col in tmp_schema].count(args.policyType) == 1:
        raise Exception(
            "Wrong number of matching %s columns found in schema: %s" %
            (args.policyType, tmp_schema))
    # Sub out the policies/parent_policies field for an explode expression.
    # Lightly modify other fields to promote clean and consistent writes to CSV file.
    schema = []
    for col in tmp_schema:
        if col[0] == args.policyType:
            schema.append("explode(%s) as policy" % (col[0]))
        elif "map" in col[1].lower() or "struct" in col[1].lower():
            schema.append("json_clean(%s) as %s" % (col[0], col[0]))
        else:
            schema.append("cast(%s as string)" % (col[0]))

    # Get targeted policies/parent_policies.
    # Should be passed to SparkSQL as quoted strings.
    pol_arr = ", ".join([
        "'%s'" % (pol) for pol in args.policyStrings.split(args.delimiter)
        if pol
    ])

    # Filter table contents according to policy hits.
    # A non-null policies field implies a non-null parent_policies field
    # (and vice versa), so it's OK to filter on just one.

    new_hits = (raw_data.toDF().filter("policies is not NULL").selectExpr(
        *schema).filter("policy in (%s)" % (pol_arr)))
    print("New policy triggers found since last job: %s" % (new_hits.count()))

    # Combine newly collected policy hits with dataframe of previous CSV contents.
    write_df = new_hits.union(csv_hits_in_range).orderBy(
        "start_time").coalesce(1)

    uniq = hash_key(args.salt, args.ordinal, args.subscriber, args.receiver)
    s3_loc = "s3://%s" % (os.path.join(args.outputBucket, args.outputDir,
                                       uniq))
    if args.verbose:
        print("S3 Results Location: %s" % s3_loc)

    write_df.write.option("quoteAll", True).csv(s3_loc, header=True)

    # Rename output file, if requested.
    if args.outputFilename:
        rename_resp = rename_file(
            args.outputBucket,
            args.outputFilename,
            prefix=args.outputDir,
            dont_preserve_dir=args.dontPreserveOutputDir,
            keep_orig=args.keepOrigOnRename,
        )

        if rename_resp and args.verbose:
            print("Renamed file to %s/%s." %
                  (args.outputDir, args.outputFilename))

    job.commit()
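
# A worked sketch of the epoch-alignment arithmetic used above. It assumes
# HOURLY_ALIGNER and DAILY_ALIGNER are 3600 and 86400 seconds; the script's actual
# constants are defined elsewhere and are not shown in this excerpt.
def align_down(epoch_seconds, aligner):
    """Floor an epoch (in seconds) to the start of its hour or day."""
    return (epoch_seconds // aligner) * aligner

# e.g. align_down(1700003605, 3600) == 1700002800: a timestamp 13m25s into an hour
# is floored back to the hour boundary, then multiplied by 1,000,000 to compare
# against the microsecond-resolution start_time column.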
#now = datetime.datetime.now(timezone('Asia/Seoul'))
now = datetime.datetime.utcnow()
print('current time : ' + str(now)) 
target_date_hour = now + datetime.timedelta(hours=-1)
print('target time : ' + str(target_date_hour))

timepartition = target_date_hour.strftime('_year=%Y/_month=%m/_day=%d/_hour=%H/')

for work in targetlist:
    src = 's3://{bucket}/{path}/{timepart}'.format(bucket=bucketname, path=work['src'], timepart=timepartition)
    dest = 's3://{bucket}/{path}/{timepart}'.format(bucket=bucketname, path=work['dest'], timepart=timepartition)
    print('source path : ' + src)
    print('target path : ' + dest)
    dir_exist = checkdir_ifexist(bucketname, '{path}/{timepart}'.format(path=work['src'], timepart=timepartition))
    if not dir_exist:
        print('source json path does not exist yet : {0}'.format(src))
        continue
#     if checkdir_ifexist(bucketname, '{path}/{timepart}'.format(path=work['dest'], timepart=timepartition)):
#         print('dest path already exists, so delete all parquet files before recreating it : ' + dest)
#         delete_destfile(bucketname, '{path}/{timepart}'.format(path=work['dest'], timepart=timepartition))
    print('transforming started for source : ' + src)
    json2parquet(src, dest)
    print('transforming finished for dest : ' + dest)
    

print("############ json2parquet job ended ########")


# When running on Glue
Job.commit()
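
# checkdir_ifexist and json2parquet are called above but not shown in this fragment.
# A possible implementation of the existence check, assuming boto3 and that a
# "directory" exists when at least one object carries the prefix (hypothetical sketch):
import boto3

def checkdir_ifexist(bucket, prefix):
    """Return True if any S3 object exists under the given prefix."""
    s3_client = boto3.client('s3')
    resp = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=1)
    return resp.get('KeyCount', 0) > 0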
Example #9
def main():
    ## @params: [JOB_NAME, db_name, entity_name, datetime_column, date_column, partition_column, output_bucket_name]
    args = getResolvedOptions(sys.argv, [
        'JOB_NAME', 'raw_db_name', 'clean_db_name', 'source_entity_name',
        'target_entity_name', 'datetime_column', 'date_column',
        'partition_column', 'output_bucket_name'
    ])
    job_name = args['JOB_NAME']
    raw_db_name = args['raw_db_name']
    clean_db_name = args['clean_db_name']
    source_entity_name = args['source_entity_name']
    target_entity_name = args['target_entity_name']
    partition_column = args['partition_column']
    datetime_column = args['datetime_column']
    date_column = args['date_column']
    output_bucket_name = args['output_bucket_name']

    # Constants derived from parameters
    raw_table_name = source_entity_name
    clean_table_name = target_entity_name

    processing_start_datetime = datetime.now(timezone.utc)

    # Initialization of contexts and job
    glue_context = GlueContext(SparkContext.getOrCreate())
    job = Job(glue_context)
    job.init(job_name, args)

    ## @type: DataSource
    ## @args: [database = "<db_name>", table_name = "raw_<entity_name>", transformation_ctx = "raw_data"]
    ## @return: raw_data
    ## @inputs: []
    raw_data: DynamicFrame = glue_context.create_dynamic_frame.from_catalog(
        database=raw_db_name,
        table_name=raw_table_name,
        transformation_ctx="raw_data")

    # Terminate early if there is no data to process
    if raw_data.toDF().head() is None:
        job.commit()
        return

    ## @type: CleanDataset
    ## @args: []
    ## @return: cleaned_data
    ## @inputs: [frame = raw_data]
    input_data = raw_data.toDF()
    cleaned_data = input_data.select(*[
        from_unixtime(c).alias(c) if c == 'processing_datetime' else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_timestamp(c).alias(c) if c.endswith('_datetime') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_date(c).alias(c) if c.endswith('_date') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        col(c).cast('string').alias(c) if c == 'zip' else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        col(c).cast('decimal(15,2)').alias(c) if dict(input_data.dtypes)[c] ==
        'double' else col(c) for c in input_data.columns
    ])

    ## @type: EnrichDataset
    ## @args: []
    ## @return: enriched_data
    ## @inputs: [frame = cleaned_data]
    enriched_data = cleaned_data.withColumn('etl_processing_datetime', unix_timestamp(f.lit(processing_start_datetime), 'yyyy-MM-dd HH:mm:ss').cast("timestamp")) \
        .withColumn(date_column, f.date_format(f.col(datetime_column), "yyyy-MM-dd").cast("date"))

    ## @type: DataSink
    ## @args: [connection_type = "s3", connection_options = {"path": "s3://<output_bucket_name>/clean/<entity_name>", "enableUpdateCatalog": "True", "updateBehavior": "UPDATE_IN_DATABASE", "partitionKeys" : "[<partition_key>]"}, format = "glueparquet"]
    ## @return: sink
    ## @inputs: [frame = enriched_data]

    sink = glue_context.getSink(connection_type="s3",
                                path="s3://" + output_bucket_name + "/" +
                                clean_table_name,
                                enableUpdateCatalog=True,
                                updateBehavior="UPDATE_IN_DATABASE",
                                partitionKeys=[partition_column])
    sink.setFormat("glueparquet")
    sink.setCatalogInfo(catalogDatabase=clean_db_name,
                        catalogTableName=clean_table_name)
    sink.writeFrame(DynamicFrame.fromDF(enriched_data, glue_context, 'result'))

    job.commit()
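
# The column-cleaning chain above (and the identical one in Example #5) applies one
# select() per rule. A sketch that folds the same rules into a single pass over the
# columns; clean_columns is a hypothetical name, not part of the original jobs, and
# overlapping rules are resolved first-match rather than sequentially.
from pyspark.sql.functions import col, from_unixtime, to_timestamp, to_date

def clean_columns(df):
    dtypes = dict(df.dtypes)
    cleaned = []
    for c in df.columns:
        if c == 'processing_datetime':
            # Unix epoch -> string timestamp -> timestamp, matching the chained selects above
            cleaned.append(to_timestamp(from_unixtime(col(c))).alias(c))
        elif c.endswith('_datetime'):
            cleaned.append(to_timestamp(col(c)).alias(c))
        elif c.endswith('_date'):
            cleaned.append(to_date(col(c)).alias(c))
        elif c == 'zip':
            cleaned.append(col(c).cast('string').alias(c))
        elif dtypes[c] == 'double':
            cleaned.append(col(c).cast('decimal(15,2)').alias(c))
        else:
            cleaned.append(col(c))
    return df.select(*cleaned)

# Usage: cleaned_data = clean_columns(raw_data.toDF())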