Example #1
    transformation_ctx="applymapping2")
applymapping2 = GremlinCsvTransforms.create_prefixed_columns(
    applymapping2, [('~id', 'lineId', 'od')])
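# GremlinCsvTransforms.create_prefixed_columns is a helper imported elsewhere in the job.
# From this call it is assumed to derive the Gremlin-loader '~id' column by prefixing each
# lineId value with 'od' (e.g. lineId 42 -> something like "od-42"). The exact separator is
# an assumption; only the (target, source, prefix) spec is visible in this listing.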
selectfields2 = SelectFields.apply(frame=applymapping2,
                                   paths=[
                                       "~id", "lineNumber", "quantity",
                                       "unitPrice", "discount", "supplyCost",
                                       "tax"
                                   ],
                                   transformation_ctx="selectfields2")

selectfields2.toDF().foreachPartition(neptune.add_vertices('OrderDetail'))
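# `neptune` is a job-level helper client that is not shown in this listing. The call
# neptune.add_vertices('OrderDetail') is assumed to return a function that Spark hands each
# DataFrame partition to, i.e. it receives an iterator of Rows and upserts each row as an
# 'OrderDetail' vertex. A hypothetical writer with the same shape, for illustration only:
def example_vertex_writer(label):
    def write_partition(rows):
        for row in rows:
            # The real client would issue a Gremlin upsert here; this sketch only
            # illustrates the foreachPartition calling convention.
            print('vertex %s: %s' % (label, row.asDict()))
    return write_partition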

print "Creating ORDER_DETAIL edges..."

applymapping3 = RenameField.apply(applymapping2, "~id", "~to")
applymapping3 = GremlinCsvTransforms.create_prefixed_columns(
    applymapping3, [('~from', 'orderId', 'o')])
applymapping3 = GremlinCsvTransforms.create_edge_id_column(
    applymapping3, '~from', '~to')
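# create_edge_id_column is assumed to synthesize the edge '~id' from the '~from' and '~to'
# values, so re-running the job upserts the same edge instead of creating duplicates.
# The exact id format is not shown in this listing.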
selectfields3 = SelectFields.apply(frame=applymapping3,
                                   paths=["~id", "~from", "~to", "lineNumber"],
                                   transformation_ctx="selectfields3")

selectfields3.toDF().foreachPartition(neptune.add_edges('ORDER_DETAIL'))

print "Creating PRODUCT edges..."

applymapping4 = RenameField.apply(applymapping2, "~id", "~from")
applymapping4 = GremlinCsvTransforms.create_prefixed_columns(
    applymapping4, [('~to', 'productId', 'p')])
def etl_user_communication(dyf_crm_concats):
    print_is_dev('etl_user_communication___****************************************************************************')
    # Cache all supporting data (extra data)
    # ----------------------------------------------------------------------------------------------------------------#
    dyf_adv_phones = connectGlue(database='tig_advisor', table_name='student_contact_phone',
                                  select_fields=["contact_id", "phone","deleted"],
                                  fillter=['contact_id', 'phone'])
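    # connectGlue is a project helper that is not part of this listing. From its call sites
    # it presumably wraps glueContext.create_dynamic_frame.from_catalog(...) for the given
    # database/table, keeps only `select_fields`, and drops records whose `fillter` columns
    # are null or empty. That reading is an assumption.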
    dyf_adv_phones = RenameField.apply(dyf_adv_phones, "deleted", "is_deleted")
    count_adv_phones = dyf_adv_phones.count()
    if count_adv_phones < 1:
        return
    # ----------------------------------------------------------------------------------------------------------------#
    dyf_adv_advisor = connectGlue(database='tig_advisor', table_name='student_contact',
                                  select_fields=["contact_id", "student_id"],
                                  fillter=['contact_id', 'student_id'])
    count_adv_advisor = dyf_adv_advisor.count()
    if count_adv_advisor < 1:
        return
    # ----------------------------------------------------------------------------------------------------------------#
    dyf_toa_enterprise = connectGlue(database='dm_toa', table_name='toa_mapping_student_enterprise',
                                  select_fields=["lms_id", "enterprise_id"],
                                  fillter=['lms_id', 'enterprise_id'])

    count_toa_enterprise = dyf_toa_enterprise.count()
    if count_toa_enterprise < 1:
        return

    dyf_toa_enterprise.printSchema()
    # --------------------------------------------------------------------------------------------------------------#
    # Start Transform

    print_is_dev("id and code----------------------------------------------------")
    print_is_dev(dyf_crm_concats.count())

    df_crm_concats = dyf_crm_concats.toDF()
    df_concat_enterprise_clear = df_crm_concats.dropDuplicates(['Id', 'Code'])
    dyf_concat_enterprise_clear = DynamicFrame.fromDF(df_concat_enterprise_clear, glueContext, 'dyf_concat_enterprise_clear')

    print_is_dev(dyf_concat_enterprise_clear.count())

    dyf_toa_enterprise = dyf_toa_enterprise.resolveChoice(specs=[('enterprise_id', 'cast:string')])
    dyf_join_contact_enterprise = Join.apply(dyf_concat_enterprise_clear, dyf_adv_advisor, 'Code', 'contact_id')
    dyf_join_contact_enterprise = Join.apply(dyf_toa_enterprise, dyf_join_contact_enterprise, 'lms_id', 'student_id')
    dyf_join_contact_enterprise = RenameField.apply(dyf_join_contact_enterprise, "enterprise_id", "comunication")
    comunication_add_collum['communication_type'] = f.lit(8)
    dyf_concat_enterprise = mappingForAll(dyf_join_contact_enterprise, comunication, comunication_add_collum)
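    # mappingForAll, `comunication` and `comunication_add_collum` are defined elsewhere in the
    # job. From these call sites, mappingForAll presumably applies the `comunication` field
    # mapping and then attaches each entry of the add-column dict as a literal column
    # (here communication_type = 8 for enterprise IDs). This is an assumption.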

    if is_dev:
        print('dyf_concat_enterprise')
        dyf_concat_enterprise.printSchema()
        dyf_concat_enterprise.show(10)
    ########  save to s3######
    # parquetToS3(dyf_concat_enterprise, path="s3://dtsodin/student_behavior/up/user_comunication")

    save_communication_redshift(dyf_concat_enterprise)
    ##########################################

    # ETL for phone communication
    dyf_join_contact_phones = Join.apply(dyf_concat_enterprise_clear, dyf_adv_phones, 'Code', 'contact_id')
    dyf_join_contact_phones = RenameField.apply(dyf_join_contact_phones, "phone", "comunication")
    comunication_add_collum['communication_type'] = f.lit(1)
    dyf_concat_phone = mappingForAll(dyf_join_contact_phones, comunication, comunication_add_collum_for_phone)

    if is_dev:
        print('dyf_concat_phone')
        dyf_concat_phone.printSchema()
        dyf_concat_phone.show(10)

    ########  save to s3######
    #parquetToS3(dyf_concat_phone, path="s3://dtsodin/student_behavior/up/user_comunication")
    save_communication_redshift(dyf_concat_phone)
    ##########################################


    # ETL for username communication
    dyf_crm_concat_name = fillterOutNull(dyf_crm_concats, ['Fullname'])
    df_crm_concat_name = dyf_crm_concat_name.toDF()
    df_concat_name_clear = df_crm_concat_name.dropDuplicates(['Id', 'Code', 'Fullname'])
    dyf_concat_name_clear = DynamicFrame.fromDF(df_concat_name_clear, glueContext, 'dyf_concat_name_clear')
    dyf_crm_concat_name = RenameField.apply(dyf_concat_name_clear, "Fullname", "comunication")
    # set communication type
    comunication_add_collum['communication_type'] = f.lit(4)

    dyf_concat_name = mappingForAll(dyf_crm_concat_name, comunication, comunication_add_collum)


    if is_dev:
        print('dyf_concat_name')
        dyf_concat_name.printSchema()
        dyf_concat_name.show(10)

    ########  save to s3######
    #parquetToS3(dyf_concat_name, path="s3://dtsodin/student_behavior/up/user_comunication")
    save_communication_redshift(dyf_concat_name)
    ##########################################

    # ETL for email communication

    dyf_crm_concat_emails = fillterOutNull(dyf_crm_concats, ['Email'])
    df_crm_concat_emails = dyf_crm_concat_emails.toDF()
    df_crm_concat_emails = df_crm_concat_emails.dropDuplicates(['Id', 'Code', 'Email'])
    dyf_crm_concat_emails = DynamicFrame.fromDF(df_crm_concat_emails, glueContext, 'dyf_crm_concat_emails')
    dyf_crm_concat_email = RenameField.apply(dyf_crm_concat_emails, "Email", "comunication")

    comunication_add_collum['communication_type'] = f.lit(2)
    dyf_concat_email = mappingForAll(dyf_crm_concat_email, comunication, comunication_add_collum)

    if is_dev:
        print('dyf_concat_email')
        dyf_concat_email.printSchema()
        dyf_concat_email.show(10)
    ########  save to s3######
    #parquetToS3(dyf_concat_email, path="s3://dtsodin/student_behavior/up/user_comunication")
    save_communication_redshift(dyf_concat_email)
    ##########################################


    # ETL for email2 communication
    dyf_crm_concat_email_2 = fillterOutNull(dyf_crm_concats, ['Email2'])
    df_crm_concat_email_2 = dyf_crm_concat_email_2.toDF()
    df_crm_concat_email_2 = df_crm_concat_email_2.dropDuplicates(['Id', 'Code', 'Email2'])
    dyf_crm_concat_email_2 = DynamicFrame.fromDF(df_crm_concat_email_2, glueContext, 'dyf_crm_concat_email_2')

    # Email2 is the secondary address, so it is stored as a non-primary communication.
    dyf_crm_concat_email_2 = RenameField.apply(dyf_crm_concat_email_2, "Email2", "comunication")
    comunication_add_collum['is_primary'] = f.lit(0)

    dyf_concat_email2 = mappingForAll(dyf_crm_concat_email_2, comunication, comunication_add_collum)

    if is_dev:
        print('dyf_concat_email2')
        dyf_concat_email2.printSchema()
        dyf_concat_email2.show(10)
    ########  save to s3######
    #parquetToS3(dyf_concat_email2, path="s3://dtsodin/student_behavior/up/user_comunication")
    save_communication_redshift(dyf_concat_email2)
    ##########################################


    # ETL for address communication
    dyf_crm_concat_address = fillterOutNull(dyf_crm_concats, ['Address'])
    df_crm_concat_address = dyf_crm_concat_address.toDF()
    df_crm_concat_address = df_crm_concat_address.dropDuplicates(['Id', 'Code', 'Address'])
    dyf_crm_concat_address = DynamicFrame.fromDF(df_crm_concat_address, glueContext, 'dyf_crm_concat_address')

    dyf_crm_concat_address = RenameField.apply(dyf_crm_concat_address, "Address", "comunication")
    comunication_add_collum['is_primary'] = f.lit(1)
    comunication_add_collum['communication_type'] = f.lit(6)
    dyf_concat_address = mappingForAll(dyf_crm_concat_address, comunication, comunication_add_collum)

    if is_dev:
        print('dyf_concat_address')
        dyf_concat_address.printSchema()
        dyf_concat_address.show(10)

    ########  save to s3######
    #parquetToS3(dyf_concat_address, path="s3://dtsodin/student_behavior/up/user_comunication")
    save_communication_redshift(dyf_concat_address)
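
# save_communication_redshift is not included in this listing. Judging from the Redshift
# writes in etl_user_map below, it presumably wraps write_dynamic_frame.from_jdbc_conf for a
# user-communication table. A sketch under that assumption (the table name and temp-dir path
# below are placeholders, not values confirmed by the listing):
def save_communication_redshift_sketch(dyf_communication):
    glueContext.write_dynamic_frame.from_jdbc_conf(frame=dyf_communication,
                                                   catalog_connection="glue_redshift",
                                                   connection_options={
                                                       "dbtable": "up_user_communication",  # placeholder table name
                                                       "database": "transaction_log"
                                                   },
                                                   redshift_tmp_dir="s3://example-bucket/tmp/up_user_communication",  # placeholder
                                                   transformation_ctx="datasink_communication")
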
selectfields1 = SelectFields.apply(frame = applymapping1, paths = ["~id", "name:String", "category:String", "description:String", "unitPrice", "quantityPerUnit:Int", "imageUrl:String"], transformation_ctx = "selectfields1")

selectfields1.toDF().foreachPartition(gremlin_client.upsert_vertices('Product', batch_size=100))
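# gremlin_client.upsert_vertices(label, batch_size=...) is assumed, like the Neptune helper
# above, to return a per-partition writer; the batch_size argument suggests it groups rows
# into batched Gremlin upserts rather than issuing one request per row. The client
# implementation itself is not shown in this listing.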

# Supplier vertices

print("Creating Supplier vertices...")

datasource3 = glueContext.create_dynamic_frame.from_catalog(database = database, table_name = supplier_table, transformation_ctx = "datasource3")

applymapping2 = ApplyMapping.apply(frame = datasource3,
                                   mappings = [("COUNTRY", "string", "country:String", "string"),
                                               ("ADDRESS", "string", "address:String", "string"),
                                               ("NAME", "string", "name:String", "string"),
                                               ("STATE", "string", "state:String", "string"),
                                               ("SUPPLIER_ID", "int", "supplierId", "int"),
                                               ("CITY", "string", "city:String", "string"),
                                               ("PHONE", "string", "phone:String", "string")],
                                   transformation_ctx = "applymapping2")
applymapping2 = GlueGremlinCsvTransforms.create_prefixed_columns(applymapping2, [('~id', 'supplierId', 's')])
selectfields3 = SelectFields.apply(frame = applymapping2, paths = ["~id", "country:String", "address:String", "city:String", "phone:String", "name:String", "state:String"], transformation_ctx = "selectfields3")

selectfields3.toDF().foreachPartition(gremlin_client.upsert_vertices('Supplier', batch_size=100))

# SUPPLIER edges

print("Creating SUPPLIER edges...")

applymapping1 = RenameField.apply(applymapping1, "~id", "~from")
applymapping1 = GlueGremlinCsvTransforms.create_edge_id_column(applymapping1, '~from', '~to')
selectfields2 = SelectFields.apply(frame = applymapping1, paths = ["~id", "~from", "~to"], transformation_ctx = "selectfields2")
        
selectfields2.toDF().foreachPartition(gremlin_client.upsert_edges('SUPPLIER', batch_size=100))

# End

job.commit()

print("Done")
def etl_user_map(dyf_crm_concats):
    print_is_dev('etl_user_map___*******************************************************************************************')

    # ETL for mapper
    df_crm_concats = dyf_crm_concats.toDF()
    df_crm_concat_clear = df_crm_concats.dropDuplicates(['Id', 'Code'])
    dyf_crm_concat_clear = DynamicFrame.fromDF(df_crm_concat_clear, glueContext, 'dyf_crm_concat_clear')

    dyf_crm_concat_map = RenameField.apply(dyf_crm_concat_clear, "Code", "source_id")
    dyf_crm_concat_map = mappingForAll(dyf_crm_concat_map, mapper, mapper_add_collum)

    if is_dev:
        print('dyf_crm_concat_map')
        dyf_crm_concat_map.printSchema()
        dyf_crm_concat_map.show(10)

    ########  save to s3######
    #parquetToS3(dyf_crm_concat_map,path="s3://dtsodin/student_behavior/up/user_mapper")

    glueContext.write_dynamic_frame.from_jdbc_conf(frame=dyf_crm_concat_map,
                                                   catalog_connection="glue_redshift",
                                                   connection_options={
                                                       "dbtable": "up_user_map",
                                                       "database": "transaction_log"
                                                   },
                                                   redshift_tmp_dir="s3n://datashine-dev-redshift-backup/translation_log/user_profile/up_user_map",
                                                   transformation_ctx="datasink4")

    ############################################################

    print_is_dev('etl_user_map___********************************************************************************************')
    # ---------------------------------------------------------------------------------------------------------------#
    dyf_adv_student = connectGlue(database='tig_advisor', table_name='student_contact',
                                   select_fields=["contact_id", "student_id"],
                                   fillter=['contact_id', 'student_id'])
    count_adv_student = dyf_adv_student.count()
    if count_adv_student < 1:
        return

    dyf_adv_student.show(10)
    # -----------------------------------------------------------------------------------------------------------------#
    # Start Transform

    # ETL for the student_id mapping
    dyf_crm_concat_clear = DynamicFrame.fromDF(df_crm_concat_clear, glueContext, 'dyf_crm_concat_clear')

    dyf_join_contact_student = Join.apply(dyf_adv_student, dyf_crm_concat_clear, 'contact_id', 'Code')
    dyf_adv_student = RenameField.apply(dyf_join_contact_student, "student_id", "source_id")
    mapper_add_collum['source_type'] = f.lit(2)
    dyf_adv_student = mappingForAll(dyf_adv_student, mapper, mapper_add_collum)
    dyf_adv_student.show(10)
    ########  save to s3######
    #parquetToS3(dyf_adv_student, path="s3://dtsodin/student_behavior/up/user_mapper")
    ############################################################
    glueContext.write_dynamic_frame.from_jdbc_conf(frame=dyf_adv_student,
                                                   catalog_connection="glue_redshift",
                                                   connection_options={
                                                       "dbtable": "up_user_map",
                                                       "database": "transaction_log"
                                                   },
                                                   redshift_tmp_dir="s3n://datashine-dev-redshift-backup/translation_log/user_profile/up_user_map",
                                                   transformation_ctx="datasink4")