Example #1
from pyspark.context import SparkContext
from pyspark.sql import DataFrame
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame


def write(database,
          table,
          df,
          jdbc_conf,
          cut_off_percentage,
          repartition=False,
          s3path='',
          format='csv'):
    """ Python wrapper function for Scala class CustomJDBCUtils
    @params:  
              database              - string name of the database to write to
              table                 - string name of the table being written to (format = 'schema.table')
              df                    - pyspark.sql.DataFrame to be written
              jdbc_conf             - jdbc_conf for the connection, extracted from glueContext.extract_jdbc_conf()
              cut_off_percentage    - integer representing the bad records threshhold (ex. 50 = 50%)
              s3path                - path in s3 where the bad records should be written to (optional)
              format                - file format to write out the bad records as (default = 'csv')
    """
    sc = SparkContext.getOrCreate()
    glueContext = GlueContext(sc)
    jvm = sc._jvm
    jsc = sc._jsc
    jdf = df._jdf

    if s3path != "":
        df = DataFrame(
            jvm.com.slf.CustomJDBCUtils.write(database, table, jdf, jdbc_conf,
                                              cut_off_percentage), jsc)
        try:
            df.repartition(1).write.format(format).save(s3path)
            return df
        except Exception:
            # Spark write failed; fall back to writing via GlueContext
            glueContext.write_dynamic_frame_from_options(
                frame=DynamicFrame.fromDF(df, glueContext, 'dynamic_frame'),
                connection_type='s3',
                connection_options={'path': s3path},
                format=format)
    else:
        return DataFrame(
            jvm.com.slf.CustomJDBCUtils.write(database, table, jdf, jdbc_conf,
                                              cut_off_percentage), jsc)
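A minimal usage sketch for the wrapper above, assuming the function is available inside a Glue job script; the connection name, S3 paths, database, and table names below are placeholders:

sc = SparkContext.getOrCreate()
glue_context = GlueContext(sc)

# Placeholder connection/table names; extract_jdbc_conf() is the GlueContext
# method the docstring above refers to.
jdbc_conf = glue_context.extract_jdbc_conf('my_jdbc_connection')
source_df = glue_context.spark_session.read.parquet('s3://my-bucket/input/')

result_df = write(database='reporting',
                  table='reporting.daily_sales',
                  df=source_df,
                  jdbc_conf=jdbc_conf,
                  cut_off_percentage=50,
                  s3path='s3://my-bucket/bad-records/',
                  format='csv')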
Example #2
    def test_duplicate_rows_in_target_are_deduped(self, mock_job):
        glue_context = GlueContext(self.spark)
        self.__mock_staging(glue_context, [
            {
                "id": "02",
                "firstname": "Bob from staging",
                "modifieddate": "2019-01-02T00:40:32Z"
            }
        ])
        self.__mock_existing_target(glue_context, [
            {
                "id": "01",
                "first_name": "John",
                "modified_date": datetime.fromisoformat("2019-01-01T00:40:32+00:00")
            },
            {
                "id": "02",
                "first_name": "Bob",
                "modified_date": datetime.fromisoformat("2019-01-01T00:40:32+00:00")
            },
            {
                "id": "01",
                "first_name": "Bill",
                "modified_date": datetime.fromisoformat("2019-01-02T00:40:32+00:00")
            }
        ])
        glue_context.write_dynamic_frame_from_options = MagicMock()
        glue_context.purge_s3_path = MagicMock()

        merge_into_customer_dim.main(self.argv, glue_context, mock_job)

        expected_df = self.spark.createDataFrame([
                ["01", "Bill", None, None, None, datetime.fromisoformat("2019-01-02T00:40:32+00:00")],
                ["02", "Bob from staging", None, None, None, datetime.fromisoformat("2019-01-02T00:40:32+00:00")]
            ], schema=self.output_schema)

        write_args, write_kargs = glue_context.write_dynamic_frame_from_options.call_args
        self.assert_dataframe_equal(write_kargs['frame'].toDF(), expected_df, ["id"])
Example #3
    def test_the_target_path_is_purged(self, mock_job):
        glue_context = GlueContext(self.spark)
        self.__mock_staging(glue_context, [
                {
                    "id": "01",
                    "firstname": "John",
                    "lastname": "Smith",
                    "birthdate": "1990-01-01",
                    "zipcode": "12345",
                    "modifieddate": "2019-01-01T00:40:32Z",
                }
            ])
        self.__mock_existing_target(glue_context, [])
        glue_context.write_dynamic_frame_from_options = MagicMock()
        glue_context.purge_s3_path = MagicMock()

        merge_into_customer_dim.main(self.argv, glue_context, mock_job)

        glue_context.purge_s3_path.assert_called_with(
            s3_path = "s3://ut_target_path",
            options = {
                "retentionPeriod": 0
            }
        )
Example #4
    def test_no_existing_output(self, mock_job):
        glue_context = GlueContext(self.spark)
        self.__mock_staging(glue_context, [
                {
                    "id": "01",
                    "firstname": "John",
                    "lastname": "Smith",
                    "birthdate": "1990-01-01",
                    "zipcode": "12345",
                    "modifieddate": "2019-01-01T00:40:32Z",
                }
            ])
        self.__mock_existing_target(glue_context, [])
        glue_context.write_dynamic_frame_from_options = MagicMock()
        glue_context.purge_s3_path = MagicMock()

        merge_into_customer_dim.main(self.argv, glue_context, mock_job)

        expected_df = self.spark.createDataFrame([
                ["01", "John", "Smith", date(1990, 1, 1), "12345", datetime.fromisoformat("2019-01-01T00:40:32+00:00")]
            ], schema=self.output_schema)

        write_args, write_kargs = glue_context.write_dynamic_frame_from_options.call_args
        self.assert_dataframe_equal(write_kargs['frame'].toDF(), expected_df, ["id"])
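The tests above call an assert_dataframe_equal helper that is not shown in this snippet. A minimal sketch of one plausible implementation as a method on the same test class, assuming unittest assertions are available and rows can be compared after sorting on the given key columns:

    def assert_dataframe_equal(self, actual_df, expected_df, key_columns):
        # Hypothetical helper; the real one used by these tests is not shown.
        # Compare column sets first so ordering differences do not matter.
        self.assertEqual(sorted(actual_df.columns), sorted(expected_df.columns))

        # Collect both frames ordered by the key columns and compare row dicts.
        actual_rows = [row.asDict() for row in actual_df.orderBy(*key_columns).collect()]
        expected_rows = [row.asDict() for row in expected_df.orderBy(*key_columns).collect()]
        self.assertEqual(actual_rows, expected_rows)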
Example #5
job = Job(glue_context)
job.init(args['JOB_NAME'], args)

# Job input parameters
project = args['project']
stage = args['stage']
env = Env.get_config(project, stage)
incoming_path = "s3://{}/{}/food".format(env["s3_data_lake"],
                                         env["incoming_folder"])
primary_path = "s3://{}/{}/food".format(env["s3_data_lake"],
                                        env["primary_folder"])

# Read Data
datasource0 = glue_context.create_dynamic_frame_from_options(
    connection_type="s3",
    connection_options={"paths": [incoming_path]},
    format="csv",
    format_options={"withHeader": True},
    transformation_ctx="readctx")

datasource0.printSchema()
datasource0.show(2)

# mask numbers in data with _number_
bpSamplePySpark = BpSamplePySpark(spark, env)
df = bpSamplePySpark.process_data(datasource0.toDF())
dyf = DynamicFrame.fromDF(df, glue_context, 'dyf')

# Write data
glue_context.write_dynamic_frame_from_options(
    frame=dyf,
    connection_type="s3",
    connection_options={"path": primary_path},
    format="parquet",
    transformation_ctx="writectx")
job.commit()
Example #6
import sys

from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions

args = getResolvedOptions(sys.argv, ["JOB_NAME", "table_name", "role_arn"])
table_name = args["table_name"]
role_arn = args["role_arn"]

glue_context = GlueContext(SparkContext.getOrCreate())
job = Job(glue_context)
job.init(args["JOB_NAME"], args)

dyf = glue_context.create_dynamic_frame_from_options(
    connection_type="dynamodb",
    connection_options={
        "dynamodb.region": "us-west-2",
        "dynamodb.input.tableName": table_name,
        "dynamodb.sts.roleArn": role_arn
    })
dyf.show()

glue_context.write_dynamic_frame_from_options(
    frame=dyf,
    connection_type="dynamodb",
    connection_options={
        "dynamodb.output.tableName": table_name,
        "dynamodb.throughput.write.percent": "0.5"
    })
job.commit()
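The table_name and role_arn values above arrive as Glue job arguments; one way to supply them when launching the job is boto3's start_job_run (the job name, table name, and role ARN below are placeholders):

import boto3

# Hypothetical launcher; job name, table name, and role ARN are placeholders.
glue = boto3.client('glue')
response = glue.start_job_run(
    JobName='ddb-cross-account-copy',
    Arguments={
        '--table_name': 'my_table',
        '--role_arn': 'arn:aws:iam::123456789012:role/GlueDynamoDBAccessRole'
    })
print(response['JobRunId'])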
Example #7
class generate_testdata():
    def __init__(self):

        self.glueContext = None
        self.json_obj = None
        self.new_json_obj = None
        self.df_readschema = None
        self.sqlContext = None
        self.df_tgt = None
        self.spark = None
        self.df_customer_table = None
        self.df_order_table = None

    def initialize(self):
        spark_context = SparkContext()
        self.glueContext = GlueContext(spark_context)
        self.sqlContext = SQLContext(spark_context)
        self.spark = self.glueContext.spark_session

    def extract(self):
        args_new = getResolvedOptions(sys.argv, ['cfg_file_path_new'])
        cfg_file_path_new = args_new['cfg_file_path_new']

        file_res = urlparse(cfg_file_path_new)
        s3_resource = boto3.resource('s3')
        file_obj = s3_resource.Object(file_res.netloc,
                                      file_res.path.lstrip('/'))
        content = file_obj.get()['Body'].read()
        content_new = content
        self.new_json_obj = json.loads(content_new)
        print(
            '===================================Second Json======================================='
        )
        print(content_new)
        self.df_order_table = self.glueContext.create_dynamic_frame_from_options(
            connection_type='s3',
            connection_options={
                "paths": [self.new_json_obj['Schema']['src_file_path']]
            },
            format='csv',
            format_options={
                "withHeader": True
            }).toDF()
        self.df_customer_table = self.glueContext.create_dynamic_frame_from_options(
            connection_type='s3',
            connection_options={
                "paths": [self.new_json_obj['Schema']['src_file_path_2']]
            },
            format=self.new_json_obj['Schema']['format'],
            format_options={
                "withHeader": True
            }).toDF()

        self.df_order_table.show()
        self.df_customer_table.show()

    def transform(self):
        self.df_order_table = self.df_order_table.withColumn(
            'order_date', to_date(col('order_datetime'), 'yyyy-MM-dd'))
        self.df_order_table = self.df_order_table.withColumn(
            'order_month', func.month(col('order_datetime')))

        df_filter_cust = self.df_customer_table.where(col('age') > 18)
        # inner join
        df_order_customer = self.df_order_table.join(
            df_filter_cust,
            on=(self.df_order_table['customer_id'] ==
                df_filter_cust['customer_id']),
            how='inner').select(df_filter_cust['customer_id'],
                                self.df_order_table['order_id'],
                                self.df_order_table['order_month'],
                                self.df_order_table['amount'])

        # total sales amount per month for each customer older than 18
        wind = Window.partitionBy('customer_id', 'order_month')

        df_order_customer = df_order_customer.withColumn(
            'total_sale',
            func.sum(col('amount')).over(wind))

        df_order_customer = df_order_customer.distinct()
        df_order_customer.show()

        # list the customer_id and second order_id of customers who placed more than 2 orders in the last 20 days
        wind = Window.partitionBy('customer_id', 'order_date').orderBy(
            func.col('order_id').asc())
        df_temp = self.df_order_table.withColumn('row', func.row_number().over(wind))

        df_temp = df_temp.withColumn(
            'current_date', to_date(func.current_timestamp(), 'yyyy-MM-dd'))

        df_temp = df_temp.withColumn(
            'diff_days', func.datediff('current_date', 'order_date'))

        df_temp = df_temp.withColumn(
            "diff",
            when((col('diff_days') <= lit(20)), lit(1)).otherwise(0))
        df_temp = df_temp.where(col('diff') == 1)
        wind = Window.partitionBy('customer_id')
        df_temp = df_temp.withColumn('count',
                                     func.count('order_id').over(wind))
        df_temp = df_temp.where((col('count') > 2) & (col('row') == 2))

        df_temp.show()

    def load(self):
        # self.df_tgt.coalesce(1)
        # self.df_tgt.write.mode('Overwrite').format('csv').save(
        #     's3 path',
        #     header='true')
        dynf = DynamicFrame.fromDF(self.df_tgt, self.glueContext, "glue_job")
        self.glueContext.write_dynamic_frame_from_options(
            frame=dynf,
            connection_type="s3",
            connection_options={
                "path": 's3://slf-ca-dev-glue/amp/test/testing_schema/output',
            },
            format="parquet",
            format_options={},
            transformation_ctx="")
Example #8
job.init(args["JOB_NAME"], args)

dyf = glue_context.create_dynamic_frame_from_options(
    connection_type="dynamodb",
    connection_options={
        "dynamodb.region": region,
        "dynamodb.splits": str(ddb_split),
        "dynamodb.throughput.read.percent": "1.2",
        "dynamodb.input.tableName": source_ddb_name
    })
dyf.show()

sts_client = boto3.client('sts')
sts_response = sts_client.assume_role(RoleArn=target_role_arn,
                                      RoleSessionName='assume-role')

ddb_key = sts_response['Credentials']['AccessKeyId']
ddb_secret = sts_response['Credentials']['SecretAccessKey']
ddb_token = sts_response['Credentials']['SessionToken']

glue_context.write_dynamic_frame_from_options(
    frame=dyf,
    connection_type="dynamodb",
    connection_options={
        "dynamodb.region": region,
        "dynamodb.output.tableName": target_ddb_name,
        "dynamodb.awsAccessKeyId": ddb_key,
        "dynamodb.awsSecretAccessKey": ddb_secret,
        "dynamodb.awsSessionToken": ddb_token
    })
job.commit()
Example #9
# import necessary dependencies
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import sys

args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)

job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# extract source s3 file as glue dynamic frame using data catalogue.
dynf_s3_src = glueContext.create_dynamic_frame.from_catalog(database='ctg_db_s3',
                                                            table_name='ctg_tbl_nm_corresponding_to_s3_file',
                                                            transformation_ctx="s3_src_extract")

# write the dynamic frame to S3 as parquet using the Glue API.
glueContext.write_dynamic_frame_from_options(frame=dynf_s3_src,
                                             connection_type="s3",
                                             connection_options={"path": 's3://bucket_name/path'},
                                             format="parquet",
                                             format_options={},
                                             transformation_ctx="")

job.commit()

Example #10
import sys
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions

args = getResolvedOptions(sys.argv, ["JOB_NAME"])
glue_context = GlueContext(SparkContext.getOrCreate())
job = Job(glue_context)
job.init(args["JOB_NAME"], args)

dyf = glue_context.create_dynamic_frame.from_options(
    connection_type="dynamodb",
    connection_options={
        "dynamodb.input.tableName": "logfile_scan",
        "dynamodb.throughput.read.percent": "1.0",
        "dynamodb.splits": "100"
    })

print(dyf.getNumPartitions())

glue_context.write_dynamic_frame_from_options(
    frame=dyf,
    connection_type="dynamodb",
    connection_options={
        "dynamodb.output.tableName": "logfile_scan_2",
        "dynamodb.throughput.write.percent": "1.0"
    })

job.commit()
Example #11
dyf = glue_context.create_dynamic_frame.from_options(
    connection_type='dynamodb',
    connection_options={
        'dynamodb.input.tableName': 'NatureRemo',
        'dynamodb.splits': '72'  # Should be (DPU-1)*8
    })

dyf.printSchema()
dyf = dyf.resolveChoice(specs=[('value', 'cast:double')])
# Adjust this number to your preferred number of output files.
dyf = dyf.repartition(1)

glue_context.write_dynamic_frame_from_options(
    frame=dyf,
    connection_type='s3',
    connection_options={'path': "s3://example-bucket/glue/ddb2s3/"},
    format='parquet')

# You will need to specify partitionKeys if you want to write data with Hive-style partitioning based on the columns 'year', 'month', and 'day'.
#glue_context.write_dynamic_frame_from_options(
#    frame=dyf,
#    connection_type='s3',
#    connection_options={
#        'path': "s3://example-bucket/glue/ddb2s3/",
#        'partitionKeys': ["year", "month", "day"]
#    },
#    format='parquet'
#)

job.commit()
Example #12
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Initialize the DynamicFrame from the Glue Data Catalog. Replace <GlueDatabaseName> and <GlueTableName> with the Glue DB and table names respectively

Source = glueContext.create_dynamic_frame.from_catalog(database="<GlueDatabaseName>", table_name="<GlueTableName>", transformation_ctx="Source")


# Map the source field names and data types to target values. The target values should exactly match the source DynamoDB table values
Mapped = ApplyMapping.apply(frame=Source, mappings=[
                        ("item.ID.S", "string", "ID", "string"),
                        ("item.date.M", "map", "date", "map"),
                        ("item.location.M.lng.S", "string", "location.lng", "string"),
                        ("item.location.M.lat.S", "string", "location.lat", "string")],
                        transformation_ctx="Mapped")

# Write to target DynamoDB table. Replace <TargetRegion> and <TargetDynamoDBTable> with region and table name respectively
glueContext.write_dynamic_frame_from_options(
    frame=Mapped,
    connection_type="dynamodb",
    connection_options={
        "dynamodb.region": "<TargetRegion>",
        "dynamodb.output.tableName": "<TargetDynamoDBTable>",
        "dynamodb.throughput.write.percent": "1.0"
    }
)
job.commit()
Example #13
#             "N": "0"
#         }
#     }
# }

mapped = ApplyMapping.apply(
    frame=source,
    mappings=[("item.Code.S", "string", "Code", "string"),
              ("item.Title.S", "string", "Title", "string"),
              ("item.Date.S", "string", "Date", "string"),
              ("item.Comments.S", "string", "Comments", "string"),
              ("item.Author.S", "string", "Author", "string"),
              ("item.LastPushDateTime.S", "string", "LastPushDateTime", "string"),
              ("item.Link.S", "string", "Link", "string"),
              ("item.ID.N", "integer", "ID", "integer"),
              ("item.Board.S", "string", "Board", "string"),
              ("item.PushSum.N", "integer", "PushSum", "integer")],
    transformation_ctx="mapped")

mapped.show()

glueContext.write_dynamic_frame_from_options(
    frame=mapped,
    connection_type="dynamodb",
    connection_options={
        "dynamodb.output.tableName": "articles",
        "dynamodb.throughput.write.percent": "1.0"
    })

job.commit()