Example #1
from awsglue.context import GlueContext
from pyspark.context import SparkContext

glueContext = GlueContext(SparkContext.getOrCreate())


dyf = glueContext.create_dynamic_frame_from_catalog(
    database='<database>',
    table_name='<table>'
)


dyf = glueContext.create_dynamic_frame_from_options(
    connection_type='sqlserver',
    connection_options={
        "url": "jdbc:sqlserver://database-2.xxx.eu-west-1.rds.amazonaws.com:1433/test",
        "user": "******",
        "password": "******",
        "dbtable": "<table>"
    }
)
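
from awsglue.dynamicframe import DynamicFrame

# Optional round-trip (a sketch, not in the original snippet): switch to a
# regular Spark DataFrame for DataFrame operations, then convert back to a
# DynamicFrame when a Glue transform or sink is needed.
df = dyf.toDF()
df.printSchema()
dyf = DynamicFrame.fromDF(df, glueContext, "dyf")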

# With "recurse": True, every file under the S3 prefix is read, e.g. for
# this layout:
"""
in S3:
a/b1/c1/files.json
a/b1/c2/files.json
"""
dyf = glueContext.create_dynamic_frame.from_options(
    's3',
    connection_options={"paths": ["s3://a/"], "recurse": True},
    format='json'
)
Example #2
import boto3

from awsglue.context import GlueContext
from pyspark.context import SparkContext
from pyspark.sql import functions as F

glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session

session = boto3.Session(region_name='us-west-2')
glue_client = session.client(service_name='glue')

## @params: [JOB_NAME]
workflowName = 'AmazonForecastWorkflow'
workflow = glue_client.get_workflow(Name=workflowName)
workflow_params = workflow['Workflow']['LastRun']['WorkflowRunProperties']
workflowRunId = workflow['Workflow']['LastRun']['WorkflowRunId']
PROCESSED_BUCKET = workflow_params['processedBucket']
LANDING_DB_NAME = workflow_params['landingDB']
LANDING_DB_TABLE = workflow_params['landingDBTable']
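
# For reference (sketch, not part of this job): an upstream job in the same
# workflow could have set these run properties through the boto3 Glue API.
# The property values below are placeholders.
# glue_client.put_workflow_run_properties(
#     Name=workflowName,
#     RunId=workflowRunId,
#     RunProperties={'processedBucket': 'my-processed-bucket',
#                    'landingDB': 'landing_db',
#                    'landingDBTable': 'orders_landing'})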

orders = glueContext.create_dynamic_frame_from_catalog(
    LANDING_DB_NAME, LANDING_DB_TABLE, transformation_ctx="orders")

ordersDF = orders.toDF()

ordersDF1 = ordersDF.select("invoicedate", "stockcode", "quantity",
                            "storelocation")

ordersDF2 = ordersDF1.withColumnRenamed(
    "stockcode",
    "item_id").withColumnRenamed("quantity", "demand").withColumnRenamed(
        "storelocation", "location").withColumnRenamed("invoicedate",
                                                       "timestamp")

ordersDF3 = ordersDF2.withColumn(
    'timestamp',
    F.from_unixtime(F.unix_timestamp('timestamp', 'dd/MM/yyyy HH:mm:ss'),
                    'yyyy-MM-dd HH:mm:ss'))
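
A plausible continuation (a sketch, not part of the original snippet): convert the reshaped DataFrame back to a DynamicFrame and write it to the processed bucket as CSV for Amazon Forecast; the forecast-input/ prefix is an assumption.

from awsglue.dynamicframe import DynamicFrame

# Sketch only: the output prefix below is an assumption, not from the original job.
ordersDyf = DynamicFrame.fromDF(ordersDF3, glueContext, "ordersDyf")
glueContext.write_dynamic_frame.from_options(
    frame=ordersDyf,
    connection_type="s3",
    connection_options={"path": "s3://{}/forecast-input/".format(PROCESSED_BUCKET)},
    format="csv")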
Example #3
import sys

from awsglue.context import GlueContext
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext

# Job parameters passed in as Glue job arguments
args = getResolvedOptions(
    sys.argv,
    ['JOB_NAME', 'workspace', 'bucketName', 'jdbc_url', 'username', 'pswd'])

glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session

workspace = args['workspace']
bucketName = args['bucketName']
jdbc_url = args['jdbc_url']
username = args['username']
pswd = args['pswd']

print('args passed are: {}, {}, {}, {}'.format(workspace, bucketName, jdbc_url,
                                               username))

s3Location = 's3://{}'.format(bucketName)

# workspace can be any of dev, qa-r-qa2, psp, prod
glue_database = '{}_sfmc_migration'.format(workspace)
############################## CUSTOMER_CONTACT ##############################

customer_contact_df = glueContext.create_dynamic_frame_from_catalog(
    glue_database, "customer_contact")
customer_contact_df.printSchema()
customer_contact_df = customer_contact_df.toDF()
#customer_contact_df.show(truncate=False)

#glueContext.write_dynamic_frame.from_jdbc_conf(frame = customer_contact_df, catalog_connection = "mysql-db",
#                                               connection_options = {"dbtable": "CUSTOMER_CONTACT", "database": "ALERT"})

customer_contact_db_df = spark.read.jdbc(jdbc_url,
                                         "CUSTOMER_CONTACT",
                                         properties={
                                             "user": username,
                                             "password": pswd,
                                             "driver": 'com.mysql.jdbc.Driver'
                                         })
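
The commented-out from_jdbc_conf call above suggests the catalog data is meant to land in MySQL; here is a hedged sketch of that idea using the plain Spark JDBC writer (the target table name and append mode are assumptions).

# Sketch only: table name and write mode are assumptions, not from the original script.
customer_contact_df.write.jdbc(url=jdbc_url,
                               table="CUSTOMER_CONTACT",
                               mode="append",
                               properties={
                                   "user": username,
                                   "password": pswd,
                                   "driver": 'com.mysql.jdbc.Driver'
                               })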
Example #4
from awsglue.context import GlueContext
from pyspark.context import SparkContext

glueContext = GlueContext(SparkContext.getOrCreate())

try:
    print('Attempt 3: create_dynamic_frame.from_catalog')
    song_df = glueContext.create_dynamic_frame.from_catalog(
            database='sparkify',
            table_name='song_data')

    print('Count: ', song_df.count())
    print('Schema: ')
    song_df.printSchema()
except Exception as e:
    print(e)

try:
    print('Attempt 4: create_dynamic_frame_from_catalog')
    song_df = glueContext.create_dynamic_frame_from_catalog(
            database='sparkify',
            table_name='song_data')

    print('Count: ', song_df.count())
    print('Schema: ')
    song_df.printSchema()
except Exception as e:
    print(e)



# l_history = Join.apply(orgs,
#                        Join.apply(persons, memberships, 'id', 'person_id'),
#                        'org_id', 'organization_id').drop_fields(['person_id', 'org_id'])
# print "Count: ", l_history.count()
# l_history.printSchema()
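
The commented-out join above follows the AWS Glue "legislators" walk-through; below is a runnable sketch of that pattern, assuming the walk-through's catalog database and tables exist (they are not part of this job).

from awsglue.transforms import Join

# Assumed catalog tables, as in the AWS Glue "legislators" example.
persons = glueContext.create_dynamic_frame.from_catalog(
    database="legislators", table_name="persons_json")
memberships = glueContext.create_dynamic_frame.from_catalog(
    database="legislators", table_name="memberships_json")
orgs = glueContext.create_dynamic_frame.from_catalog(
    database="legislators", table_name="organizations_json") \
    .rename_field('id', 'org_id').rename_field('name', 'org_name')

# Join persons to memberships on person id, then join organizations on org id.
l_history = Join.apply(orgs,
                       Join.apply(persons, memberships, 'id', 'person_id'),
                       'org_id', 'organization_id').drop_fields(['person_id', 'org_id'])
print("Count: ", l_history.count())
l_history.printSchema()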
Example #5
from awsglue.context import GlueContext
from pyspark.context import SparkContext
from pyspark.sql import Window
from pyspark.sql.functions import asc, col, concat_ws, rank, when


def parse_user_cards(cards_table=TABLE1,
                     user_scores_table=TABLE2,
                     users_table=TABLE3,
                     db=MYSQL_DB,
                     db_users_table=MYSQL_USERS_TABLE,
                     db_cards_table=MYSQL_CARDS_TABLE):
    """
    Joins user and card activity from the Influx tables with the "users" and
    "cards" tables in the MySQL database and displays the joined result.

    Arguments:
    ----------
        cards_table (str):
            "cards" table in the Influx Database
        user_scores_table (str):
            "user_scores" table in the Influx Database
        users_table (str):
            "users" table in the Influx Database
        db (str):
            Name of MySQL Database
        db_users_table (str):
            Name of "users" table in MySQL Database
        db_cards_table (str):
            Name of "cards" table in MySQL Database

    Returns:
    --------
        None
    """
    glueContext = GlueContext(SparkContext.getOrCreate())

    # Getting data from "user_scores" table and "cards" table
    influx_user_scores_df = get_influx_dataframe(table=user_scores_table)
    influx_cards_df = get_influx_dataframe(table=cards_table)
    influx_users_df = get_influx_dataframe(table=users_table)

    # Dropping duplicates from influx_users_df and influx_cards_df
    influx_cards_df = influx_cards_df.drop_duplicates(['org_id', 'user_id', 'card_id', 'assigned_to_user_id'])
    influx_users_df = influx_users_df.drop_duplicates(["org_id", "user_id", "created_user_id", "follower_id",
                                                       "followed_user_id", "time", "event_time"])    
    # Ranking based on "org_id"
    influx_users_df = influx_users_df.withColumn("rank_org_id", rank().over(Window.partitionBy("org_id")\
                                                                 .orderBy(asc("time"))))\
                                                                 .where(col("rank_org_id") >= 1)
#                                                                .select("rank_org_id", "org_id", "actor_id") 

    # Creating a DynamicFrame for "users" and "cards" from the schema of MySQL Database
    cards = glueContext.create_dynamic_frame_from_catalog(database=db, table_name=db_cards_table)
    users = glueContext.create_dynamic_frame_from_catalog(database=db, table_name=db_users_table)

    # Converting "users" and "cards" to PySpark DataFrame
    users_df = users.toDF()\
                    .drop_duplicates(["organization_id", "id", "created_at"])

    # Duplicates with the following keys for the following actions are dropped:
    # 1. card_created    2. card_deleted card_assigned    3. card_assigned_dismissed
    # 4. card_assigned_deleted    5. card_dismissed    6. card_marked_as_complete
    # 7. card_marked_as_uncomplete    8. card_viewed
    cards_df = cards.toDF()\
                    .drop_duplicates(['organization_id', 'author_id', 'id'])

    # Selecting given columns from "users_df" from total of 62 columns and "cards_df" from
    # total of 55 columns
    users_df = users_df.select("id", "first_name", "last_name", "email",
                               "created_at", "sign_in_count", "is_suspended", "is_active")

    cards_df = cards_df.select("id", "card_type","title")

    # Concatenating "first_name" and "last_name" to create new column "full_name" in users_df
    users_df = users_df.withColumn("user_full_name",
                                   concat_ws(" ", users_df.first_name, users_df.last_name))

    # Adding "user_account_status" based on values for mutually
    # exclusive columns "is_suspended" and "is_active"
    users_df = users_df.withColumn("user_account_status", when(users_df.is_suspended == "true", "suspended")\
                                                          .when(users_df.is_active == "true", "active"))

    # Joining users from MYSQLDB and Influx on user_id
    joined_users_df = users_df.join(influx_users_df, users_df.id == influx_users_df._user_id, 'inner')\
                              .drop(influx_users_df._user_id).drop(influx_users_df.org_id)
    # Joining cards from MYSQLDB and Influx on card_id
    joined_cards_df = cards_df.join(influx_cards_df, cards_df.id == influx_cards_df._card_id, 'inner')\
                              .drop(influx_cards_df._card_id).drop(influx_cards_df.card_type)\
                              .drop(influx_cards_df.ecl_id).drop(influx_cards_df.ecl_source_name)\
                              .drop(influx_cards_df.is_public).drop(influx_cards_df.readable_card_type)\
                              .drop(influx_cards_df._user_id)
    print("Joining \'users\' table and \'cards\' tables with their corresponding influx tables completed!")

    # The columns in these df's will change based on further activity measures
    processed_users_df = joined_users_df.select("user_id", "user_full_name", "rank_org_id")
    #processed_users_df = joined_users_df.select("user_id", "user_full_name")

    processed_cards_df = joined_cards_df.select("org_id", "card_title","card_type", "user_id",
                                                "card_id", "event", "event_time","time")
    # Joined users with cards (each of which have already been joined with their corresponding influx tables)
    joined_users_cards_df = processed_users_df.join(processed_cards_df, processed_users_df.user_id == processed_cards_df.user_id, 'inner')\
                                              .drop(processed_users_df.user_id)
                                              

    joined_users_cards_df.show(1000)
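
A minimal invocation sketch; it assumes the module-level constants (TABLE1, TABLE2, TABLE3, MYSQL_DB, MYSQL_USERS_TABLE, MYSQL_CARDS_TABLE) and get_influx_dataframe are defined elsewhere in the same module.

if __name__ == "__main__":
    # Relies entirely on the module-level defaults shown in the signature above.
    parse_user_cards()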
Example #6
import sys

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext

sc = SparkContext()
glueContext = GlueContext(sc)

job = Job(glueContext)
args = getResolvedOptions(
    sys.argv, ['JOB_NAME', "database_name", "raw_table_name", "table_name"])
job.init(args['JOB_NAME'], args)

database_name = args["database_name"]
raw_table_name = args["raw_table_name"]
table_name = args["table_name"]

mappings = [("impression_id", "string", "impression_id", "string"),
            ("click_id", "string", "click_id", "string"),
            ("ad_id", "string", "ad_id", "string"),
            ("occurred_at", "string", "occurred_at", "timestamp"),
            ("partition_0", "string", "year", "int"),
            ("partition_1", "string", "month", "int")]

events = glueContext.create_dynamic_frame_from_catalog(database_name, raw_table_name, transformation_ctx="events-etl") \
                    .apply_mapping(mappings) \
                    .resolveChoice(choice="MATCH_CATALOG", database=database_name, table_name=table_name)

glueContext.write_dynamic_frame.from_catalog(
    events,
    database_name,
    table_name,
    additional_options={"partitionKeys": ["year", "month"]})

job.commit()
Example #7
# ccn-alerts-104328-{workspace}-sfmc-migration
# (args, glueContext and spark are assumed to be set up as in Example #3)
workspace = args['workspace']
bucketName = args['bucketName']
jdbc_url = args['jdbc_url']
username = args['username']
pswd = args['pswd']

print('args passed are: {}, {}, {}, {}'.format(workspace, bucketName, jdbc_url,
                                               username))

s3Location = 's3://{}'.format(bucketName)

# workspace can be any of dev, qa-r-qa2, psp, prod
glue_database = '{}_sfmc_migration'.format(workspace)

customer_alert_preference_df = glueContext.create_dynamic_frame_from_catalog(
    glue_database, "customer_alert_preference")
customer_alert_preference_df.printSchema()
customer_alert_preference_df = customer_alert_preference_df.toDF()
#customer_alert_preference_df.show(truncate=False)

customer_alert_preference_db_df = spark.read.jdbc(jdbc_url,
                                                  "CUSTOMER_ALERT_PREFERENCE",
                                                  properties={
                                                      "user": username,
                                                      "password": pswd,
                                                      "driver": 'com.mysql.jdbc.Driver'
                                                  })
Example #8
import sys
from datetime import datetime

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext

args = getResolvedOptions(sys.argv, ['JOB_NAME', 'bucket', 'db', 'table'])

now = datetime.now()
print(f"Start Time : {now.strftime('%Y-%m-%d %H:%M:%S')}")

db = args['db']
table = args['table']
bucket = args['bucket']

target = f"s3://{bucket}/parquet/{table}/year={now.strftime('%Y')}/month={now.strftime('%m')}/day={now.strftime('%d')}/"

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Read files
df = glueContext.create_dynamic_frame_from_catalog(database=db,
                                                   table_name=table)

# Write data to folder
out = glueContext.write_dynamic_frame.from_options(
    frame=df,
    connection_type="s3",
    connection_options={"path": target},
    format="parquet")

end = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(f"End Time : {end}")
job.commit()