from awsglue.context import GlueContext
from pyspark.context import SparkContext

glueContext = GlueContext(SparkContext.getOrCreate())

# Read a table registered in the Glue Data Catalog
dyf = glueContext.create_dynamic_frame_from_catalog(
    database='<database>',
    table_name='<table>'
)

# Read directly from a SQL Server instance over JDBC
dyf = glueContext.create_dynamic_frame_from_options(
    connection_type='sqlserver',
    connection_options={
        "url": "jdbc:sqlserver://database-2.xxx.eu-west-1.rds.amazonaws.com:1433/test",
        "user": "******",
        "password": "******",
        "dbtable": "<table>"
    }
)

# With recurse=True, files in nested S3 prefixes are also read:
"""
in S3:
a/b1/c1/files.csv
a/b1/c2/files.csv
"""
dyf = glueContext.create_dynamic_frame.from_options(
    's3',
    connection_options={"paths": ["s3://a/"], 'recurse': True},
    format='json',
)
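# A quick way to inspect any of the frames read above: DynamicFrames convert
# to and from Spark DataFrames. A minimal sketch, assuming `dyf` holds a
# successfully-read frame (the placeholder paths above are not real sources):
df = dyf.toDF()              # DynamicFrame -> Spark DataFrame
df.printSchema()
print('row count:', df.count())

from awsglue.dynamicframe import DynamicFrame
dyf_back = DynamicFrame.fromDF(df, glueContext, "dyf_back")  # DataFrame -> DynamicFrame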
import boto3
from pyspark.context import SparkContext
from pyspark.sql import functions as F
from awsglue.context import GlueContext

glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session

session = boto3.Session(region_name='us-west-2')
glue_client = session.client(service_name='glue')

## @params: [JOB_NAME]
# Pull run properties from the workflow that triggered this job
workflowName = 'AmazonForecastWorkflow'
workflow = glue_client.get_workflow(Name=workflowName)
workflow_params = workflow['Workflow']['LastRun']['WorkflowRunProperties']
workflowRunId = workflow['Workflow']['LastRun']['WorkflowRunId']

PROCESSED_BUCKET = workflow_params['processedBucket']
LANDING_DB_NAME = workflow_params['landingDB']
LANDING_DB_TABLE = workflow_params['landingDBTable']

orders = glueContext.create_dynamic_frame_from_catalog(
    LANDING_DB_NAME, LANDING_DB_TABLE, transformation_ctx="orders")

ordersDF = orders.toDF()
ordersDF1 = ordersDF.select("invoicedate", "stockcode", "quantity", "storelocation")
ordersDF2 = ordersDF1.withColumnRenamed("stockcode", "item_id") \
    .withColumnRenamed("quantity", "demand") \
    .withColumnRenamed("storelocation", "location") \
    .withColumnRenamed("invoicedate", "timestamp")
# Note: in Spark date patterns MM is month and HH is the 24-hour clock; the
# original 'dd/mm/yyyy hh:mm:ss' would have parsed minutes as the month.
# The 'yyyy-MM-dd HH:mm:ss' output format is an assumption about the target schema.
ordersDF3 = ordersDF2.withColumn(
    'timestamp',
    F.from_unixtime(F.unix_timestamp('timestamp', 'dd/MM/yyyy HH:mm:ss'),
                    'yyyy-MM-dd HH:mm:ss'))
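# The snippet stops after the timestamp rewrite. A minimal sketch of a likely
# next step, writing the prepared data to the processed bucket pulled from the
# workflow run properties (the 'orders_out' name and the CSV format are
# assumptions, not part of the original script):
from awsglue.dynamicframe import DynamicFrame

orders_out = DynamicFrame.fromDF(ordersDF3, glueContext, "orders_out")
glueContext.write_dynamic_frame.from_options(
    frame=orders_out,
    connection_type="s3",
    connection_options={"path": "s3://{}/".format(PROCESSED_BUCKET)},
    format="csv")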
workspace = args['workspace']
bucketName = args['bucketName']
jdbc_url = args['jdbc_url']
username = args['username']
pswd = args['pswd']
print('args passed are: {}, {}, {}, {}'.format(workspace, bucketName, jdbc_url, username))

s3Location = 's3://{}'.format(bucketName)
# workspace can be any of dev, qa-r-qa2, psp, prod
glue_database = '{}_sfmc_migration'.format(workspace)

############################## CUSTOMER_CONTACT ##############################
customer_contact_df = glueContext.create_dynamic_frame_from_catalog(
    glue_database, "customer_contact")
customer_contact_df.printSchema()
customer_contact_df = customer_contact_df.toDF()
#customer_contact_df.show(truncate=False)
#glueContext.write_dynamic_frame.from_jdbc_conf(frame = customer_contact_df, catalog_connection = "mysql-db",
#                                               connection_options = {"dbtable": "CUSTOMER_CONTACT", "database": "ALERT"})

customer_contact_db_df = spark.read.jdbc(jdbc_url, "CUSTOMER_CONTACT", properties={
    "user": username,
    "password": pswd,
    "driver": 'com.mysql.jdbc.Driver'
})
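# The snippet reads the same entity from the Glue Catalog and from MySQL but
# never compares them. A minimal reconciliation sketch, assuming the intent is
# to spot catalog rows that have not reached the database yet (subtract()
# requires matching column sets, which is an assumption about the two schemas):
missing_in_db = customer_contact_df.subtract(customer_contact_db_df)
print('catalog rows: {}, db rows: {}, not yet migrated: {}'.format(
    customer_contact_df.count(), customer_contact_db_df.count(), missing_in_db.count()))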
try:
    print('Attempt 3: create_dynamic_frame.from_catalog')
    song_df = glueContext.create_dynamic_frame.from_catalog(
        database='sparkify', table_name='song_data')
    print('Count: ', song_df.count())
    print('Schema: ')
    song_df.printSchema()
except Exception as e:
    print(e)

try:
    print('Attempt 4: create_dynamic_frame_from_catalog')
    song_df = glueContext.create_dynamic_frame_from_catalog(
        database='sparkify', table_name='song_data')
    print('Count: ', song_df.count())
    print('Schema: ')
    song_df.printSchema()
except Exception as e:
    print(e)

# l_history = Join.apply(orgs,
#                        Join.apply(persons, memberships, 'id', 'person_id'),
#                        'org_id', 'organization_id').drop_fields(['person_id', 'org_id'])
# print("Count: ", l_history.count())
# l_history.printSchema()
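# A runnable version of the commented-out Join.apply chain above, assuming the
# three frames come from the AWS Glue 'legislators' sample catalog tables (the
# database and table names here are assumptions, not part of this script):
from awsglue.transforms import Join

persons = glueContext.create_dynamic_frame.from_catalog(
    database='legislators', table_name='persons_json')
memberships = glueContext.create_dynamic_frame.from_catalog(
    database='legislators', table_name='memberships_json')
orgs = glueContext.create_dynamic_frame.from_catalog(
    database='legislators', table_name='organizations_json')

l_history = Join.apply(orgs,
                       Join.apply(persons, memberships, 'id', 'person_id'),
                       'org_id', 'organization_id').drop_fields(['person_id', 'org_id'])
print('Count: ', l_history.count())
l_history.printSchema()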
# TABLE1/TABLE2/TABLE3, the MYSQL_* constants and get_influx_dataframe() are
# defined elsewhere in the original module.
from pyspark.context import SparkContext
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, asc, col, concat_ws, when
from awsglue.context import GlueContext


def parse_user_cards(cards_table=TABLE1, user_scores_table=TABLE2, users_table=TABLE3,
                     db=MYSQL_DB, db_users_table=MYSQL_USERS_TABLE,
                     db_cards_table=MYSQL_CARDS_TABLE):
    """
    Joins user and card data from the MySQL database with the corresponding
    Influx tables and displays the combined activity frame.

    Arguments:
    ----------
    cards_table (str): "cards" table in the Influx Database
    user_scores_table (str): "user_scores" table in the Influx Database
    users_table (str): "users" table in the Influx Database
    db (str): Name of MySQL Database
    db_users_table (str): Name of "users" table in MySQL Database
    db_cards_table (str): Name of "cards" table in MySQL Database

    Returns:
    --------
    None; the joined users/cards frame is displayed via show().
    """
    glueContext = GlueContext(SparkContext.getOrCreate())

    # Getting data from "user_scores", "cards" and "users" tables
    influx_user_scores_df = get_influx_dataframe(table=user_scores_table)
    influx_cards_df = get_influx_dataframe(table=cards_table)
    influx_users_df = get_influx_dataframe(table=users_table)

    # Dropping duplicates from influx_users_df and influx_cards_df
    influx_cards_df = influx_cards_df.drop_duplicates(['org_id', 'user_id', 'card_id',
                                                       'assigned_to_user_id'])
    influx_users_df = influx_users_df.drop_duplicates(["org_id", "user_id", "created_user_id",
                                                       "follower_id", "followed_user_id",
                                                       "time", "event_time"])

    # Ranking based on "org_id"
    influx_users_df = influx_users_df.withColumn("rank_org_id",
                                                 rank().over(Window.partitionBy("org_id")
                                                             .orderBy(asc("time"))))\
                                     .where(col("rank_org_id") >= 1)
    # .select("rank_org_id", "org_id", "actor_id")

    # Creating a DynamicFrame for "users" and "cards" from the schema of the MySQL Database
    cards = glueContext.create_dynamic_frame_from_catalog(database=db, table_name=db_cards_table)
    users = glueContext.create_dynamic_frame_from_catalog(database=db, table_name=db_users_table)

    # Converting "users" and "cards" to PySpark DataFrames
    users_df = users.toDF()\
        .drop_duplicates(["organization_id", "id", "created_at"])
    # Duplicates with the above keys for the following actions are dropped:
    # 1. card_created  2. card_deleted  3. card_assigned  4. card_assigned_dismissed
    # 5. card_assigned_deleted  6. card_dismissed  7. card_marked_as_complete
    # 8. card_marked_as_uncomplete  9. card_viewed
    cards_df = cards.toDF()\
        .drop_duplicates(['organization_id', 'author_id', 'id'])

    # Selecting the given columns from "users_df" (62 columns in total) and
    # "cards_df" (55 columns in total)
    users_df = users_df.select("id", "first_name", "last_name", "email", "created_at",
                               "sign_in_count", "is_suspended", "is_active")
    cards_df = cards_df.select("id", "card_type", "title")

    # Concatenating "first_name" and "last_name" into a new column "user_full_name"
    users_df = users_df.withColumn("user_full_name",
                                   concat_ws(" ", users_df.first_name, users_df.last_name))

    # Adding "user_account_status" based on the values of the mutually
    # exclusive columns "is_suspended" and "is_active"
    users_df = users_df.withColumn("user_account_status",
                                   when(users_df.is_suspended == "true", "suspended")
                                   .when(users_df.is_active == "true", "active"))

    # Joining users from MySQL and Influx on user_id
    joined_users_df = users_df.join(influx_users_df,
                                    users_df.id == influx_users_df._user_id, 'inner')\
        .drop(influx_users_df._user_id).drop(influx_users_df.org_id)

    # Joining cards from MySQL and Influx on card_id
    joined_cards_df = cards_df.join(influx_cards_df,
                                    cards_df.id == influx_cards_df._card_id, 'inner')\
        .drop(influx_cards_df._card_id).drop(influx_cards_df.card_type)\
        .drop(influx_cards_df.ecl_id).drop(influx_cards_df.ecl_source_name)\
        .drop(influx_cards_df.is_public).drop(influx_cards_df.readable_card_type)\
        .drop(influx_cards_df._user_id)

    print("Joining 'users' and 'cards' tables with their corresponding influx tables completed!")

    # The columns in these df's will change based on further activity measures
    processed_users_df = joined_users_df.select("user_id", "user_full_name", "rank_org_id")
    #processed_users_df = joined_users_df.select("user_id", "user_full_name")
    processed_cards_df = joined_cards_df.select("org_id", "card_title", "card_type", "user_id",
                                                "card_id", "event", "event_time", "time")

    # Joining users with cards (each already joined with its corresponding influx table)
    joined_users_cards_df = processed_users_df.join(
        processed_cards_df,
        processed_users_df.user_id == processed_cards_df.user_id, 'inner')\
        .drop(processed_users_df.user_id)

    joined_users_cards_df.show(1000)
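    # Continuing inside parse_user_cards: a minimal sketch of one "activity
    # measure" the comments above hint at, counting card events per user
    # (the metric itself is an assumption, not part of the original function):
    from pyspark.sql.functions import count

    activity_df = joined_users_cards_df.groupBy("user_id", "user_full_name", "event")\
        .agg(count("card_id").alias("event_count"))\
        .orderBy("user_full_name", "event")
    activity_df.show(50)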
import sys
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.context import SparkContext

sc = SparkContext()
glueContext = GlueContext(sc)
job = Job(glueContext)

args = getResolvedOptions(
    sys.argv, ['JOB_NAME', "database_name", "raw_table_name", "table_name"])
job.init(args['JOB_NAME'], args)

database_name = args["database_name"]
raw_table_name = args["raw_table_name"]
table_name = args["table_name"]

# (source_column, source_type, target_column, target_type)
mappings = [("impression_id", "string", "impression_id", "string"),
            ("click_id", "string", "click_id", "string"),
            ("ad_id", "string", "ad_id", "string"),
            ("occurred_at", "string", "occurred_at", "timestamp"),
            ("partition_0", "string", "year", "int"),
            ("partition_1", "string", "month", "int")]

events = glueContext.create_dynamic_frame_from_catalog(database_name, raw_table_name,
                                                       transformation_ctx="events-etl") \
    .apply_mapping(mappings) \
    .resolveChoice(choice="MATCH_CATALOG", database=database_name, table_name=table_name)

glueContext.write_dynamic_frame.from_catalog(
    events, database_name, table_name,
    additional_options={"partitionKeys": ["year", "month"]})

job.commit()
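# Because the output is partitioned on year/month, a later job can prune
# partitions at read time. A minimal sketch using push_down_predicate (the
# specific year/month values are placeholders):
recent = glueContext.create_dynamic_frame.from_catalog(
    database=database_name,
    table_name=table_name,
    push_down_predicate="year == 2021 AND month == 5")
print('Rows in selected partitions: ', recent.count())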
#ccn-alerts-104328-{workspace}-sfmc-migration
workspace = args['workspace']
bucketName = args['bucketName']
jdbc_url = args['jdbc_url']
username = args['username']
pswd = args['pswd']
print('args passed are: {}, {}, {}, {}'.format(workspace, bucketName, jdbc_url, username))

s3Location = 's3://{}'.format(bucketName)
# workspace can be any of dev, qa-r-qa2, psp, prod
glue_database = '{}_sfmc_migration'.format(workspace)

customer_alert_preference_df = glueContext.create_dynamic_frame_from_catalog(
    glue_database, "customer_alert_preference")
customer_alert_preference_df.printSchema()
customer_alert_preference_df = customer_alert_preference_df.toDF()
#customer_alert_preference_df.show(truncate=False)

customer_alert_preference_db_df = spark.read.jdbc(jdbc_url, "CUSTOMER_ALERT_PREFERENCE", properties={
    "user": username,
    "password": pswd,
    "driver": 'com.mysql.jdbc.Driver'
})
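# A sketch of the write-back step, mirroring the commented-out
# write_dynamic_frame.from_jdbc_conf call in the CUSTOMER_CONTACT snippet above
# (the "mysql-db" connection and "ALERT" database names are carried over from
# that comment, so treat them as assumptions here):
from awsglue.dynamicframe import DynamicFrame

alert_pref_dyf = DynamicFrame.fromDF(customer_alert_preference_df, glueContext, "alert_pref_dyf")
glueContext.write_dynamic_frame.from_jdbc_conf(
    frame=alert_pref_dyf,
    catalog_connection="mysql-db",
    connection_options={"dbtable": "CUSTOMER_ALERT_PREFERENCE", "database": "ALERT"})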
import sys
from datetime import datetime
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.context import SparkContext

args = getResolvedOptions(sys.argv, ['JOB_NAME', 'bucket', 'db', 'table'])

now = datetime.now()
print(f"Start Time : {now.strftime('%Y-%m-%d %H:%M:%S')}")

db = args['db']
table = args['table']
bucket = args['bucket']
target = f"s3://{bucket}/parquet/{table}/year={now.strftime('%Y')}/month={now.strftime('%m')}/day={now.strftime('%d')}/"

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Read files
df = glueContext.create_dynamic_frame_from_catalog(database=db, table_name=table)

# Write data to the dated folder
out = glueContext.write_dynamic_frame.from_options(
    frame=df,
    connection_type="s3",
    connection_options={"path": target},
    format="parquet")

end = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(f"End Time : {end}")
job.commit()
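# A sketch of kicking this job off programmatically from a separate driver
# script; Glue passes job arguments with a leading "--" (the job name and
# argument values here are placeholders):
import boto3

glue = boto3.client('glue', region_name='us-west-2')
run = glue.start_job_run(
    JobName='<job-name>',
    Arguments={'--bucket': '<bucket>', '--db': '<database>', '--table': '<table>'})
print('Started run:', run['JobRunId'])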