def main(args, yesterday):
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    job = Job(glueContext)
    job.init(args['JOB_NAME'], args)

    database = args['database_name']
    table_name = '{0}kinesis_data'.format(args['table_prefix'])

    partition_predicate = \
        'partition_0="{0}" and partition_1="{1:02}" and partition_2="{2:02}"'.format(
            yesterday.year, yesterday.month, yesterday.day)

    datasource0 = glueContext.create_dynamic_frame.from_catalog(
        database=database,
        table_name=table_name,
        push_down_predicate=partition_predicate,
        transformation_ctx='datasource0')

    if datasource0.count() > 0:
        applymapping1 = ApplyMapping.apply(frame=datasource0,
                                           mappings=[('device_id', 'string',
                                                      'device_id', 'string'),
                                                     ('timestamp', 'long',
                                                      'timestamp', 'long')],
                                           transformation_ctx='applymapping1')

        df = applymapping1.toDF()
        dev = df.drop_duplicates(['device_id'])
        device_list = dev.collect()
        result_list = []

        for device in device_list:
            device_id = device['device_id']
            df2 = df[df.device_id == device_id]
            df_list = df2.sort('timestamp', ascending=True).collect()

            result_list.extend(create_diff_timestamp_list(device_id, df_list))

        df = spark.createDataFrame(result_list)
        dyf = DynamicFrame.fromDF(df, glueContext, 'dyf')

        # Hive-style partition path (year=/month=/day=)
        output_path = os.path.join(args['target_bucket'],
                                   'year={0}'.format(yesterday.year),
                                   'month={0:02}'.format(yesterday.month),
                                   'day={0:02}'.format(yesterday.day))

        glueContext.write_dynamic_frame.from_options(
            frame=dyf,
            connection_type='s3',
            connection_options={'path': output_path},
            format='parquet',
            transformation_ctx='datasink2')

    job.commit()
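
# The helper create_diff_timestamp_list used above is not part of this snippet.
# A minimal, hypothetical sketch of it, assuming it emits one record per pair of
# consecutive readings with the time gap between them (field names are assumptions):
def create_diff_timestamp_list(device_id, df_list):
    """Sketch only: build per-device records with the gap between consecutive timestamps."""
    results = []
    for prev, curr in zip(df_list, df_list[1:]):
        results.append({
            'device_id': device_id,
            'timestamp': curr['timestamp'],
            'diff_timestamp': curr['timestamp'] - prev['timestamp'],  # assumed semantics
        })
    return results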
Example #2
def mappingForAll(dynamicFrame, mapping):
    applymapping2 = ApplyMapping.apply(frame=dynamicFrame,
                                       mappings=mapping)

    resolvechoice2 = ResolveChoice.apply(frame=applymapping2, choice="make_cols",
                                         transformation_ctx="resolvechoice2")

    dyf_communication_after_mapping = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields2")

    return dyf_communication_after_mapping
def mappingForAll(dynamicFrame, mapping):

    # for k, v in add_collum.items():
    #     df_communication = df_communication.withColumn(k, v)

    applymapping2 = ApplyMapping.apply(frame=dynamicFrame,
                                       mappings=mapping)

    resolvechoice2 = ResolveChoice.apply(frame=applymapping2, choice="make_cols",
                                         transformation_ctx="resolvechoice2")

    dyf_communication_after_mapping = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields2")
    return dyf_communication_after_mapping
Example #4
def retrieve_dynamic_frame(glue_context, spark, data, header, name, mapping):
    data_frame = spark.createDataFrame(data, header)
    dynamic_frame = DynamicFrame.fromDF(data_frame, glue_context, name)

    display(dynamic_frame, "retrieve" + name)

    apply_dynamic_frame = ApplyMapping.apply(frame=dynamic_frame,
                                             mappings=mapping)
    dyf_result = ResolveChoice.apply(frame=apply_dynamic_frame,
                                     choice="make_cols",
                                     transformation_ctx="resolvechoice2")

    display(dyf_result, "apply" + name)

    return dyf_result
def mappingForAll(dynamicFrame, mapping, add_collum):
    df_communication = dynamicFrame.toDF()
    df_communication = df_communication.dropDuplicates()

    for k, v in add_collum.items():
        df_communication = df_communication.withColumn(k, v)

    dyf_communication = DynamicFrame.fromDF(df_communication, glueContext, 'dyf_communication')
    applymapping2 = ApplyMapping.apply(frame=dyf_communication,
                                       mappings=mapping)

    resolvechoice2 = ResolveChoice.apply(frame=applymapping2, choice="make_cols",
                                         transformation_ctx="resolvechoice2")

    dyf_communication_after_mapping = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields2")
    return dyf_communication_after_mapping
def mappingForAll(dynamicFrame, mapping):
    df = dynamicFrame.toDF()
    df = df.dropDuplicates()
    dyf = DynamicFrame.fromDF(df, glueContext, "dyf")
    print("-------------------------------------------------")
    dyf.printSchema()
    print(mapping)

    applymapping2 = ApplyMapping.apply(frame=dyf, mappings=mapping)

    resolvechoice2 = ResolveChoice.apply(frame=applymapping2,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")

    dyf_mapping = DropNullFields.apply(frame=resolvechoice2,
                                       transformation_ctx="dropnullfields2")
    return dyf_mapping
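
# Hypothetical usage sketch for the mappingForAll variants above; dyf_source, the
# column names and the literal value are illustrative only. Glue ApplyMapping
# mappings are 4-tuples: (source_name, source_type, target_name, target_type).
example_mapping = [
    ("contact_id", "string", "contact_id", "string"),
    ("created_at", "string", "created_at", "long"),
]
dyf_mapped = mappingForAll(dyf_source, example_mapping)

# The three-argument variant additionally takes a dict of columns to add via
# withColumn, e.g. {"behavior_id": f.lit(27)} with f = pyspark.sql.functions.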
## @type: ApplyMapping
## @args: [mapping = [("col0", "string", "col0", "string"), ("col1", "string", "col1", "string"), ("col2", "string", "col2", "string"), ("col3", "string", "col3", "string"), ("col4", "string", "col4", "string"), ("col5", "string", "col5", "string"), ("col6", "string", "col6", "string"), ("col7", "string", "col7", "string"), ("col8", "string", "col8", "string"), ("col9", "string", "col9", "string"), ("col10", "string", "col10", "string"), ("col11", "string", "col11", "string"), ("col12", "string", "col12", "string"), ("col13", "string", "col13", "string"), ("col14", "string", "col14", "string"), ("col15", "string", "col15", "string"), ("col16", "string", "col16", "string")], transformation_ctx = "applymapping1"]
## @return: applymapping1
## @inputs: [frame = datasource0]
applymapping1 = ApplyMapping.apply(
    frame=datasource0,
    mappings=[
        ("col0", "string", "col0", "string"),
        ("col1", "string", "col1", "string"),
        ("col2", "string", "col2", "string"),
        ("col3", "string", "col3", "string"),
        ("col4", "string", "col4", "string"),
        ("col5", "string", "col5", "string"),
        ("col6", "string", "col6", "string"),
        ("col7", "string", "col7", "string"),
        ("col8", "string", "col8", "string"),
        ("col9", "string", "col9", "string"),
        ("col10", "string", "col10", "string"),
        ("col11", "string", "col11", "string"),
        ("col12", "string", "col12", "string"),
        ("col13", "string", "col13", "string"),
        ("col14", "string", "col14", "string"),
        ("col15", "string", "col15", "string"),
        ("col16", "string", "col16", "string"),
    ],
    transformation_ctx="applymapping1",
)
## @type: ResolveChoice
## @args: [choice = "make_cols", transformation_ctx = "resolvechoice2"]
## @return: resolvechoice2
## @inputs: [frame = applymapping1]
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database="dotz_challenge_raw",
    table_name="price_quote_csv",
    transformation_ctx="datasource0",
)
## @type: ApplyMapping
## @args: [mapping = [("tube_assembly_id", "string", "tube_assembly_id", "string"), ("supplier", "string", "supplier", "string"), ("quote_date", "string", "quote_date", "string"), ("annual_usage", "long", "annual_usage", "long"), ("min_order_quantity", "long", "min_order_quantity", "long"), ("bracket_pricing", "string", "bracket_pricing", "string"), ("quantity", "long", "quantity", "long"), ("cost", "double", "cost", "double")], transformation_ctx = "applymapping1"]
## @return: applymapping1
## @inputs: [frame = datasource0]
applymapping1 = ApplyMapping.apply(
    frame=datasource0,
    mappings=[
        ("tube_assembly_id", "string", "tube_assembly_id", "string"),
        ("supplier", "string", "supplier", "string"),
        ("quote_date", "string", "quote_date", "string"),
        ("annual_usage", "long", "annual_usage", "long"),
        ("min_order_quantity", "long", "min_order_quantity", "long"),
        ("bracket_pricing", "string", "bracket_pricing", "string"),
        ("quantity", "long", "quantity", "long"),
        ("cost", "double", "cost", "double"),
    ],
    transformation_ctx="applymapping1",
)
## @type: ResolveChoice
## @args: [choice = "make_cols", transformation_ctx = "resolvechoice2"]
## @return: resolvechoice2
## @inputs: [frame = applymapping1]
resolvechoice2 = ResolveChoice.apply(
    frame=applymapping1, choice="make_cols", transformation_ctx="resolvechoice2"
)
## @type: DropNullFields
## @args: [transformation_ctx = "dropnullfields3"]
Example #9
## @inputs: []
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database="demo-mysql",
    table_name="salesdb_supplier",
    transformation_ctx="datasource0")
print "MySQL Rows read : " + str(datasource0.count())
datasource0.toDF().show()
## @type: ApplyMapping
## @args: [mapping = [("country", "string", "country", "string"), ("address", "string", "address", "string"), ("name", "string", "name", "string"), ("state", "string", "state", "string"), ("supplier_id", "int", "supplier_id", "int"), ("city", "string", "city", "string"), ("phone", "string", "phone", "string")], transformation_ctx = "applymapping1"]
## @return: applymapping1
## @inputs: [frame = datasource0]
applymapping1 = ApplyMapping.apply(
    frame=datasource0,
    mappings=[("COUNTRY", "string", "country", "string"),
              ("ADDRESS", "string", "address", "string"),
              ("NAME", "string", "name", "string"),
              ("STATE", "string", "state", "string"),
              ("SUPPLIER_ID", "int", "supplier_id", "int"),
              ("CITY", "string", "city", "string"),
              ("PHONE", "string", "phone", "string")],
    transformation_ctx="applymapping1")
print "Source count datasource0: " + str(applymapping1.count())
applymapping1.toDF().show()
## @type: SelectFields
## @args: [paths = ["country", "address", "city", "phone", "name", "state", "supplier_id"], transformation_ctx = "selectfields2"]
## @return: selectfields2
## @inputs: [frame = applymapping1]
selectfields2 = SelectFields.apply(frame=applymapping1,
                                   paths=[
                                       "country", "address", "city", "phone",
                                       "name", "state", "supplier_id"
                                   ],
                                   transformation_ctx="selectfields2")

def main():
    """This script will load data from the supplied DynamoDB Table to S3 so it can be analyzed with Athena"""
    if (JOB_TYPE == "issues"):
        DDB_TABLE_NAME = DDB_ISSUES_TABLE_NAME
        GLUE_TABLE_NAME = GLUE_ISSUES_TABLE_NAME
        FIELD_PATHS = [
            "eventid", "acknowledged", "created", "sitename", "issuesource",
            "priority",
            "areaname#status#processname#eventdescription#stationname#devicename#created",
            "version", "devicename", "devicename#eventid", "createdat",
            "areaname", "processname", "createddateutc", "eventdescription",
            "areaname#status#processname#stationname#devicename#created",
            "stationname", "id", "acknowledgedtime", "status", "updatedat",
            "closed", "resolutiontime", "createdby", "acknowledgedby",
            "closedby", "rejectedby", "additionaldetails"
        ]
    elif (JOB_TYPE == "hierarchy"):
        DDB_TABLE_NAME = DDB_DATA_HIERARCHY_TABLE_NAME
        GLUE_TABLE_NAME = GLUE_DATA_HIERARCHY_TABLE_NAME
        FIELD_PATHS = [
            "createdat", "name", "description", "id", "devicestationid",
            "type", "version", "parentid", "updatedat", "areasiteid",
            "eventprocessid", "eventtype", "priority", "rootcauses", "sms",
            "eventimgkey", "email", "protocol", "endpoint", "filterpolicy",
            "subscriptionarn", "stationareaid", "processareaid", "alias"
        ]
    else:
        raise JobInputException(
            f"JOB_TYPE was invalid ({JOB_TYPE}). Expecting either \"issues\" or \"hierarchy\""
        )

    log_message([
        "Running with the following context:",
        f"DDB_TABLE_NAME: {DDB_TABLE_NAME}",
        f"GLUE_TABLE_NAME: {GLUE_TABLE_NAME}", f"GLUE_DB_NAME: {GLUE_DB_NAME}",
        f"GLUE_OUTPUT_BUCKET: {GLUE_OUTPUT_BUCKET}"
    ])

    DDB_TABLE_NAME_FORMATTED = DDB_TABLE_NAME.lower().replace('-', '_')

    log_message("Mapping columns")
    COLUMN_MAPPINGS = list(map(lambda x: get_column_mapping(x), FIELD_PATHS))

    log_message("Creating a Dynamic Frame from the DynamoDB table schema")
    datasource0 = glue_context.create_dynamic_frame.from_catalog(
        database=GLUE_DB_NAME,
        table_name=DDB_TABLE_NAME_FORMATTED,
        transformation_ctx="datasource0")

    log_message("Applying column mappings")
    applymapping1 = ApplyMapping.apply(frame=datasource0,
                                       mappings=COLUMN_MAPPINGS,
                                       transformation_ctx="applymapping1")

    log_message("Selecting fields")
    selectfields2 = SelectFields.apply(frame=applymapping1,
                                       paths=FIELD_PATHS,
                                       transformation_ctx="selectfields2")

    log_message("Resolving")
    resolvechoice3 = ResolveChoice.apply(frame=selectfields2,
                                         choice="MATCH_CATALOG",
                                         database=GLUE_DB_NAME,
                                         table_name=GLUE_TABLE_NAME,
                                         transformation_ctx="resolvechoice3")

    resolvechoice4 = ResolveChoice.apply(frame=resolvechoice3,
                                         choice="make_struct",
                                         transformation_ctx="resolvechoice4")

    log_message("Persisting data in S3")
    glue_context.write_dynamic_frame.from_catalog(
        frame=resolvechoice4,
        database=GLUE_DB_NAME,
        table_name=GLUE_TABLE_NAME,
        transformation_ctx="datasink5")

    job.commit()
    log_message("Done")
Example #11
    transformation_ctx="datasource0")
## @type: ApplyMapping
## @args: [mapping = [("component_id", "string", "component_id", "string"), ("component_type_id", "string", "component_type_id", "string"), ("type", "string", "type", "string"), ("connection_type_id", "string", "connection_type_id", "string"), ("outside_shape", "string", "outside_shape", "string"), ("base_type", "string", "base_type", "string"), ("height_over_tube", "double", "height_over_tube", "double"), ("bolt_pattern_long", "string", "bolt_pattern_long", "string"), ("bolt_pattern_wide", "string", "bolt_pattern_wide", "string"), ("groove", "string", "groove", "string"), ("base_diameter", "string", "base_diameter", "string"), ("shoulder_diameter", "string", "shoulder_diameter", "string"), ("unique_feature", "string", "unique_feature", "string"), ("orientation", "string", "orientation", "string"), ("weight", "string", "weight", "string")], transformation_ctx = "applymapping1"]
## @return: applymapping1
## @inputs: [frame = datasource0]
applymapping1 = ApplyMapping.apply(
    frame=datasource0,
    mappings=[
        ("component_id", "string", "component_id", "string"),
        ("component_type_id", "string", "component_type_id", "string"),
        ("type", "string", "type", "string"),
        ("connection_type_id", "string", "connection_type_id", "string"),
        ("outside_shape", "string", "outside_shape", "string"),
        ("base_type", "string", "base_type", "string"),
        ("height_over_tube", "double", "height_over_tube", "double"),
        ("bolt_pattern_long", "string", "bolt_pattern_long", "string"),
        ("bolt_pattern_wide", "string", "bolt_pattern_wide", "string"),
        ("groove", "string", "groove", "string"),
        ("base_diameter", "string", "base_diameter", "string"),
        ("shoulder_diameter", "string", "shoulder_diameter", "string"),
        ("unique_feature", "string", "unique_feature", "string"),
        ("orientation", "string", "orientation", "string"),
        ("weight", "string", "weight", "string"),
    ],
    transformation_ctx="applymapping1",
)
## @type: ResolveChoice
## @args: [choice = "make_cols", transformation_ctx = "resolvechoice2"]
## @return: resolvechoice2
## @inputs: [frame = applymapping1]
resolvechoice2 = ResolveChoice.apply(frame=applymapping1,
                                     choice="make_cols",
                                     transformation_ctx="resolvechoice2")

def etl_student_advisor():
    print_is_dev(
        "etl_advisor_profile___****************************************************************************************"
    )
    # ETL for profile
    # dyf_student_advisor = connectGlue(database="tig_advisor", table_name="log_change_assignment_advisor",
    #     #                                   select_fields=["id", "contact_id", "advisor_id_old", "advisor_id_new","created_at", "updated_at"],
    #     #                                   fillter=["id"],
    #     #                                   duplicates=["id"])

    # ------------------------------------------------------------------------------------------------------------------#
    df_student_advisor = get_dyf_student_advisor(glueContext)

    number_student_advisor = df_student_advisor.count()
    if is_dev:
        print("number_advisor_account: ", number_student_advisor)
        df_student_advisor.printSchema()
        df_student_advisor.show(3)

    if number_student_advisor < 1:
        return

    # ------------------------------------------------------------------------------------------------------------------#
    # process contact_id with advisor from log_change_assignment_advisor and student_contact

    df_contact_id_list = df_student_advisor.select('contact_id')

    df_contact_id_list = df_contact_id_list.dropDuplicates(['contact_id'])

    df_change_advisor_new = get_df_change_advisor_new(df_student_advisor)
    #df_change_advisor_init = get_df_change_advisor_init(df_student_advisor)

    #df_advisor_from_student_contact = get_advisor_from_student_contact(glueContext, df_contact_id_list)

    df_change_advisor_total = df_change_advisor_new
    # .union(df_change_advisor_init)
    #.union(df_advisor_from_student_contact)

    if is_dev:
        print('df_change_advisor_total')
        df_change_advisor_total.printSchema()
        df_change_advisor_total.show(3)
    #-------------------------------------------------------------------------------------------------------------------#


    df_change_advisor_total = df_change_advisor_total\
        .withColumn('created_at',
                    f.unix_timestamp('created_at', format='yyyy-MM-dd HH:mm:ss'))

    df_change_advisor_total = df_change_advisor_total.orderBy(
        "contact_id", "created_at")

    if is_dev:
        print('df_change_advisor_total_after_order')
        df_change_advisor_total.printSchema()
        df_change_advisor_total.show(3)

    df_change_advisor_total = df_change_advisor_total.select(
        "contact_id",
        f.struct('advisor_id', 'created_at').alias('end_start'))

    print('df_change_advisor_total')
    df_change_advisor_total.printSchema()
    df_change_advisor_total.show(3)

    df_change_advisor_group_contact = df_change_advisor_total.groupBy("contact_id") \
        .agg(f.collect_list("end_start").alias("l_created_at"))

    print('df_change_advisor_group_contact_______________')
    df_change_advisor_group_contact.printSchema()
    df_change_advisor_group_contact.show(3)

    df_change_advisor_group_contact = df_change_advisor_group_contact\
        .withColumn("l_created_at", transpose_start_end_udf("l_created_at"))

    df_change_advisor_group_contact.printSchema()
    df_change_advisor_explore = df_change_advisor_group_contact\
        .select("contact_id",
        f.explode("l_created_at").alias("start_end_transpose")
    )

    df_change_advisor_explore.printSchema()
    df_change_advisor_explore = df_change_advisor_explore.select(
        "contact_id",
        f.col("start_end_transpose").getItem("advisor_id").alias("advisor_id"),
        f.col("start_end_transpose").getItem("start").alias("start"),
        f.col("start_end_transpose").getItem("end").alias("end"))
    df_change_advisor_explore.printSchema()
    df_change_advisor_explore = addCollum(df_change_advisor_explore,
                                          STUDENT_ADVISOR_ADD_COL)
    dyf_change_advisor_explore = DynamicFrame.fromDF(
        df_change_advisor_explore, glueContext, "dyf_change_advisor_explore")
    # dyf_change_advisor_explore = mappingForAll(dyf_change_advisor_explore, mapping = STUDENT_ADVISOR_MAPPING)

    print('dyf_change_advisor_explore')
    dyf_change_advisor_explore.printSchema()
    dyf_change_advisor_explore.show(10)

    ########  save to s3######
    # parquetToS3(dyf_student_advisor_mapping, path="s3://dtsodin/student_behavior/advisor_thangvm/advisor_history")
    ############################################################
    # ------------------------------------------------------------------------------------------------------------------#

    # | -- student_advisor_id: string
    # | -- contact_id: string
    # | -- advisor_id: string
    # | -- start_date: string
    # | -- end_date: string
    # | -- created_at: int
    # | -- updated_at: string

    apply_output = ApplyMapping.apply(
        frame=dyf_change_advisor_explore,
        mappings=[("contact_id", "string", "contact_id", "string"),
                  ("advisor_id", "string", "advisor_id", "long"),
                  ("start", "string", "start_date", "long"),
                  ("end", "string", "end_date", "long"),
                  ("created_at", "int", "created_at", "long")])
    #
    dyf_output = ResolveChoice.apply(frame=apply_output,
                                     choice="make_cols",
                                     transformation_ctx="resolvechoice2")

    glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dyf_output,
        catalog_connection="glue_redshift",
        connection_options={
            # "preactions": """TRUNCATE TABLE ad_student_advisor """,
            "dbtable": "ad_student_advisor",
            "database": "transaction_log"
        },
        redshift_tmp_dir=
        "s3n://datashine-dev-redshift-backup/translation_log/user_advisor/ad_student_package",
        transformation_ctx="datasink4")
Example #13
DYNAMODB_INPUT_TABLE_NAME = 'aws-glue-local-test-table'
S3_OUTPUT_BUCKET_NAME = 'aws-glue-local-test-bucket'

args = getResolvedOptions(sys.argv, ['JOB_NAME'])

JOB_NAME = args['JOB_NAME']

sc = SparkContext()
glueContext = GlueContext(sc)
job = Job(glueContext)
job.init(JOB_NAME, args)

datasource = glueContext.create_dynamic_frame.from_options(
    connection_type='dynamodb',
    connection_options={'dynamodb.input.tableName': DYNAMODB_INPUT_TABLE_NAME})

applymapping = ApplyMapping.apply(
    frame=datasource,
    mappings=[('Id', 'string', 'Id', 'string'),
              ('Column1', 'string', 'Column1', 'string'),
              ('Column2', 'string', 'Column2', 'string'),
              ('Column3', 'string', 'Column3', 'string')])

glueContext.write_dynamic_frame.from_options(
    frame=applymapping,
    connection_type='s3',
    connection_options={'path': 's3://' + S3_OUTPUT_BUCKET_NAME},
    format='csv')

job.commit()
Example #14
relationalized_dynamic_frames = Relationalize.apply(
    frame=ec2s,
    staging_path=tempDir,
    transformation_ctx="relationalized_dynamic_frames")

# 'roottable' is the default prefix of relationalization
rdf = relationalized_dynamic_frames.select(
    'roottable_Reservations.val.Instances')

rdf.show(1)

# 2. Map fields to bulk load CSV column headings format
applymapping1 = ApplyMapping.apply(
    frame=rdf,
    mappings=[("`Reservations.val.Instances.val.InstanceId`", "string",
               "instanceId:String", "string"),
              ("`Reservations.val.Instances.val.InstanceType`", "string",
               "instanceType:String", "string")],
    transformation_ctx="applymapping1")

applymapping1.show(1)

# 3. Append prefixes to values in ID columns (ensures vertices for different types have unique IDs across graph)
applymapping2 = GlueGremlinCsvTransforms.create_prefixed_columns(
    applymapping1, [('~id', 'instanceId:String', 'ec2')])

# 4. Select fields for upsert
selectfields1 = SelectFields.apply(
    frame=applymapping2,
    paths=["~id", 'instanceId:String', 'instanceType:String'],
    transformation_ctx="selectfields1")
Example #15
print "Creating Order vertices..."

datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database=database,
    table_name=order_table,
    transformation_ctx="datasource0")
df0 = datasource0.toDF().filter(col("ORDER_DATE") == checkpoint)
datasource1 = DynamicFrame.fromDF(df0, glueContext, 'datasource1')

print "Total orders         : " + str(datasource0.count())
print "Orders for checkpoint: " + str(datasource1.count())

applymapping1 = ApplyMapping.apply(
    frame=datasource1,
    mappings=[("ORDER_DATE", "timestamp", "orderDate", "string"),
              ("SHIP_MODE", "string", "shipMode", "string"),
              ("SITE_ID", "double", "siteId", "int"),
              ("ORDER_ID", "int", "orderId", "int")],
    transformation_ctx="applymapping1")
applymapping1 = GremlinCsvTransforms.create_prefixed_columns(
    applymapping1, [('~id', 'orderId', 'o')])
selectfields1 = SelectFields.apply(frame=applymapping1,
                                   paths=["~id", "orderDate", "shipMode"],
                                   transformation_ctx="selectfields1")

selectfields1.toDF().foreachPartition(neptune.add_vertices('Order'))

print "Creating OrderDetail vertices..."

datasource2 = glueContext.create_dynamic_frame.from_catalog(
    database=database,
Example #16
    "Select a.*, bround(a.QUANTITY*a.UNIT_PRICE,2) as EXTENDED_PRICE, \
bround(QUANTITY*(UNIT_PRICE-SUPPLY_COST) ,2) as PROFIT, \
DATE_FORMAT(ORDER_DATE,'yyyyMMdd') as DATE_KEY \
from (Select * from tbl0) a")
df1.show(5)
datasource4 = DynamicFrame.fromDF(df1, glueContext, 'datasource4')

applymapping1 = ApplyMapping.apply(
    frame=datasource4,
    mappings=[("DISCOUNT", "decimal(10,2)", "discount", "decimal(10,2)"),
              ("UNIT_PRICE", "decimal(10,2)", "unit_price", "decimal(10,2)"),
              ("TAX", "decimal(10,2)", "tax", "decimal(10,2)"),
              ("SUPPLY_COST", "decimal(10,2)", "supply_cost", "decimal(10,2)"),
              ("PRODUCT_ID", "int", "product_id", "int"),
              ("QUANTITY", "int", "quantity", "int"),
              ("LINE_ID", "int", "line_id", "int"),
              ("LINE_NUMBER", "int", "line_number", "int"),
              ("ORDER_DATE", "timestamp", "order_date", "timestamp"),
              ("SHIP_MODE", "string", "ship_mode", "string"),
              ("SITE_ID", "double", "site_id", "int"),
              ("PROFIT", "decimal(10,2)", "profit", "decimal(10,2)"),
              ("EXTENDED_PRICE", "decimal(10,2)", "extended_price",
               "decimal(10,2)"), ("DATE_KEY", "string", "date_key", "string"),
              ("ORDER_ID", "int", "order_id", "int")],
    transformation_ctx="applymapping1")
print "After ApplyMapping :"
applymapping1.printSchema()
applymapping1.toDF().show(5, False)

#selectfields2 = SelectFields.apply(frame = applymapping1, paths = ["quantity", "discount", "tax", "unit_price", "line_id", "date_key", "order_date", "extended_price", "ship_mode", "line_number", "product_id", "site_id", "supply_cost", "order_id", "profit"], transformation_ctx = "selectfields2")

selectfields2 = SelectFields.apply(frame=applymapping1,
                                   paths=["quantity", "discount", "tax", "unit_price",
                                          "line_id", "date_key", "order_date",
                                          "extended_price", "ship_mode", "line_number",
                                          "product_id", "site_id", "supply_cost",
                                          "order_id", "profit"],
                                   transformation_ctx="selectfields2")

product_table = '{}salesdb_product'.format(args['TABLE_PREFIX'])
product_category_table = '{}salesdb_product_category'.format(args['TABLE_PREFIX'])
supplier_table = '{}salesdb_supplier'.format(args['TABLE_PREFIX'])

gremlin_endpoints = GlueNeptuneConnectionInfo(args['AWS_REGION'], args['CONNECT_TO_NEPTUNE_ROLE_ARN']).neptune_endpoints(args['NEPTUNE_CONNECTION_NAME'])
gremlin_client = GlueGremlinClient(gremlin_endpoints)

# Product vertices

print("Creating Product vertices...")

datasource0 = glueContext.create_dynamic_frame.from_catalog(database = database, table_name = product_table, transformation_ctx = "datasource0")
datasource1 = glueContext.create_dynamic_frame.from_catalog(database = database, table_name = product_category_table, transformation_ctx = "datasource1")
datasource2 = datasource0.join( ["CATEGORY_ID"],["CATEGORY_ID"], datasource1, transformation_ctx = "join")

applymapping1 = ApplyMapping.apply(frame = datasource2, mappings = [("NAME", "string", "name:String", "string"), ("UNIT_PRICE", "decimal(10,2)", "unitPrice", "string"), ("PRODUCT_ID", "int", "productId", "int"), ("QUANTITY_PER_UNIT", "int", "quantityPerUnit:Int", "int"), ("CATEGORY_ID", "int", "category_id", "int"), ("SUPPLIER_ID", "int", "supplierId", "int"), ("CATEGORY_NAME", "string", "category:String", "string"), ("DESCRIPTION", "string", "description:String", "string"), ("IMAGE_URL", "string", "imageUrl:String", "string")], transformation_ctx = "applymapping1")
applymapping1 = GlueGremlinCsvTransforms.create_prefixed_columns(applymapping1, [('~id', 'productId', 'p'),('~to', 'supplierId', 's')])
selectfields1 = SelectFields.apply(frame = applymapping1, paths = ["~id", "name:String", "category:String", "description:String", "unitPrice", "quantityPerUnit:Int", "imageUrl:String"], transformation_ctx = "selectfields1")

selectfields1.toDF().foreachPartition(gremlin_client.upsert_vertices('Product', batch_size=100))

# Supplier vertices

print("Creating Supplier vertices...")

datasource3 = glueContext.create_dynamic_frame.from_catalog(database = database, table_name = supplier_table, transformation_ctx = "datasource3")

applymapping2 = ApplyMapping.apply(frame = datasource3, mappings = [("COUNTRY", "string", "country:String", "string"), ("ADDRESS", "string", "address:String", "string"), ("NAME", "string", "name:String", "string"), ("STATE", "string", "state:String", "string"), ("SUPPLIER_ID", "int", "supplierId", "int"), ("CITY", "string", "city:String", "string"), ("PHONE", "string", "phone:String", "string")], transformation_ctx = "applymapping1")
applymapping2 = GlueGremlinCsvTransforms.create_prefixed_columns(applymapping2, [('~id', 'supplierId', 's')])
selectfields3 = SelectFields.apply(frame = applymapping2, paths = ["~id", "country:String", "address:String", "city:String", "phone:String", "name:String", "state:String"], transformation_ctx = "selectfields3")
job.init(args['JOB_NAME'], args)
## @type: DataSource
## @args: [database = "demo-mysql", table_name = "salesdb_product", transformation_ctx = "datasource0"]
## @return: datasource0
## @inputs: []
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "demo-mysql", table_name = "salesdb_product", transformation_ctx = "datasource0")
datasource1 = glueContext.create_dynamic_frame.from_catalog(database = "demo-mysql", table_name = "salesdb_product_category", transformation_ctx = "datasource1")

datasource3=datasource0.join( ["CATEGORY_ID"],["CATEGORY_ID"], datasource1, transformation_ctx = "join")
print "Joined Rows : "+str(datasource3.count())
datasource3.printSchema()
## @type: ApplyMapping
## @args: [mapping = [("name", "string", "name", "string"), ("unit_price", "decimal(10,2)", "unit_price", "decimal(10,2)"), ("product_id", "double", "product_id", "int"), ("quantity_per_unit", "double", "quantity_per_unit", "int"), ("category_id", "double", "category_id", "int"), ("supplier_id", "double", "supplier_id", "int")], transformation_ctx = "applymapping1"]
## @return: applymapping1
## @inputs: [frame = datasource3]
applymapping1 = ApplyMapping.apply(frame = datasource3, mappings = [("NAME", "string", "name", "string"), ("UNIT_PRICE", "decimal(10,2)", "unit_price", "decimal(10,2)"), ("PRODUCT_ID", "int", "product_id", "int"), ("QUANTITY_PER_UNIT", "int", "quantity_per_unit", "int"), ("CATEGORY_ID", "int", "category_id", "int"), ("SUPPLIER_ID", "int", "supplier_id", "int"), ("CATEGORY_NAME", "string", "category_name", "string"), ("DESCRIPTION", "string", "description", "string"), ("IMAGE_URL", "string", "image_url", "string")], transformation_ctx = "applymapping1")
#applymapping1.toDF().show()
## @type: SelectFields
## @args: [paths = ["category_name", "category_id", "image_url", "product_id", "name", "description", "quantity_per_unit", "unit_price", "supplier_id"], transformation_ctx = "selectfields2"]
## @return: selectfields2
## @inputs: [frame = applymapping1]
selectfields2 = SelectFields.apply(frame = applymapping1, paths = ["category_name", "category_id", "image_url", "product_id", "name", "description", "quantity_per_unit", "unit_price", "supplier_id"], transformation_ctx = "selectfields2")
print "selectfields2 Rows : "+str(selectfields2.count())
selectfields2.toDF().show()
## @type: DataSink
## @args: [database = "demo-redshift", table_name = "redshiftdb_public_product_dim", redshift_tmp_dir = TempDir, transformation_ctx = "datasink3"]
## @return: datasink3
## @inputs: [frame = selectfields2]
datasink3 = glueContext.write_dynamic_frame.from_catalog(frame = selectfields2, database = "demo-redshift", table_name = "redshiftdb_public_product_dim", redshift_tmp_dir = args["TempDir"], transformation_ctx = "datasink3")
job.commit()
Example #19
sdf_row_data = raw_dynamic_frame.toDF()
sdf_row_data.select(column_sort_values).show()

sdf_conveted_column_to_sort_data = sdf_row_data.withColumn(
    column_sort_values,
    col(column_sort_values).cast("timestamp"))
sdf_sorted_data = sdf_conveted_column_to_sort_data.orderBy(
    desc(column_sort_values))

sdf_deduplicate_data = sdf_sorted_data.drop_duplicates(
    [column_drop_duplicates])

types_mapping_dict = load_mapping_types(jsonObject,
                                        sdf_deduplicate_data.dtypes)

df_deduplicate_data = DynamicFrame.fromDF(sdf_deduplicate_data, glueContext,
                                          "df_deduplicate_data")

df_converted_type_data = ApplyMapping.apply(
    frame=df_deduplicate_data,
    mappings=types_mapping_dict,
    transformation_ctx="df_converted_type_data")

datasink = glueContext.write_dynamic_frame.from_options(
    frame=df_converted_type_data,
    connection_type="s3",
    connection_options={"path": "s3://glue-terraform/data/output"},
    format="parquet",
    transformation_ctx="datasink")

job.commit()
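
# load_mapping_types is defined elsewhere in that job. Given that its output feeds
# ApplyMapping and its inputs are a JSON config plus the DataFrame dtypes, a plausible
# minimal sketch (the config shape {"column_name": "target_type"} is an assumption):
def load_mapping_types(json_object, dtypes):
    mappings = []
    for column_name, source_type in dtypes:
        target_type = json_object.get(column_name, source_type)
        mappings.append((column_name, source_type, column_name, target_type))
    return mappings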