def process_song_data(spark, input_data, output_data):
    """
    Description: Read the song_data JSON files from S3 into a Spark dataframe,
        extract the columns needed for the songs ("songinf") table and the
        artists table, and write both tables back to S3 in the format this
        project needs.

    Parameters:
        spark: Spark session
        input_data: location of the song_data JSON files (S3 bucket)
        output_data: location where the final tables will be saved (S3 bucket)

    Return:
        None
    """
    #--------------------read song data--------------------#
    print('Read song_data...')

    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data_*.json")

    # define the song data schema for reading
    SongSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Doub()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Doub()),
        Fld("artist_name", Str()),
        Fld("duration", Doub()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long())
    ])

    # read song data file
    song_df = spark.read.json(song_data, schema=SongSchema)

    #--------------------deal with songs table--------------------#
    # extract columns to create the songs dataframe, then drop duplicates and nulls
    songinf_df = song_df.select(['song_id', 'title', 'artist_id', 'year', 'duration'])
    songinf_df = songinf_df.dropDuplicates(['song_id'])
    songinf_df = songinf_df.dropna(how="any", subset=["song_id"])
    songinf_df = songinf_df.filter(songinf_df.song_id != "")
    print('Songs table: ')
    songinf_df.sort('song_id').show(5)

    # write songs table to parquet files partitioned by year and artist
    print('Save Songs table into S3...')
    songinf_df.write.partitionBy("year", "artist_id").parquet("{}/song_table.parquet".format(output_data))

    #--------------------deal with artists table--------------------#
    # extract columns to create the artists dataframe, then drop duplicates and nulls
    artist_df = song_df.select(['artist_id', 'artist_name', 'artist_location',
                                'artist_latitude', 'artist_longitude'])
    artist_df = artist_df.dropDuplicates(['artist_id'])
    artist_df = artist_df.dropna(how="any", subset=["artist_id"])
    artist_df = artist_df.filter(artist_df.artist_id != "")
    print('artists table: ')
    artist_df.sort('artist_id').show(5)

    # write artists table to parquet files
    print('Save artists table into S3...')
    artist_df.write.parquet("{}/artist_table.parquet".format(output_data))
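# Note: the snippets in this section assume pyspark.sql type aliases imported
# roughly as below (a minimal sketch; the exact alias names vary per snippet,
# e.g. Doub vs Dbl for DoubleType):
import os
from datetime import datetime

from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import (
    StructType as R,
    StructField as Fld,
    StringType as Str,
    DoubleType as Doub,
    LongType as Long,
    TimestampType,
)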
def process_song_data(spark, input_data, output_data):
    """
    Load the song_data dataset, extract the columns for the songs and artists
    tables, and write both tables to parquet files on S3.
    """
    song_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long())
    ])

    # get filepath to song data file
    song_data = 'song_data/*/*/*/*.json'

    # read song data file
    df = spark.read.json(os.path.join(input_data, song_data), schema=song_schema)

    # extract columns to create songs table
    songs_table = df.select('song_id', 'title', 'artist_id', 'year', 'duration').dropDuplicates()
    songs_table.createOrReplaceTempView('songs')

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy('year', 'artist_id').parquet(
        os.path.join(output_data, 'songs/songs.parquet'), 'overwrite')

    # extract columns to create artists table
    artists_table = df.select('artist_id', 'artist_name', 'artist_location',
                              'artist_latitude', 'artist_longitude') \
        .withColumnRenamed('artist_name', 'name') \
        .withColumnRenamed('artist_location', 'location') \
        .withColumnRenamed('artist_latitude', 'latitude') \
        .withColumnRenamed('artist_longitude', 'longitude') \
        .dropDuplicates()
    artists_table.createOrReplaceTempView('artists')

    # write artists table to parquet files
    artists_table.write.parquet(
        os.path.join(output_data, 'artists/artists.parquet'), 'overwrite')
def get_log_schema():
    """
    Creates a schema for log data.

    :return: schema
    """
    log_schema = R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Str()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Str()),
        Fld("song", Str()),
        Fld("status", Str()),
        Fld("ts", Long()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])
    return log_schema
def create_log_data():
    """
    Create schema for log data.

    :return: schema
    """
    log_schema = StructType([
        StructField("artist", Str()),
        StructField('auth', Str()),
        StructField('firstName', Str()),
        StructField('gender', Str()),
        StructField('itemInSession', Int()),
        StructField('lastName', Str()),
        StructField('length', Dbl()),
        StructField('level', Str()),
        StructField('location', Str()),
        StructField('method', Str()),
        StructField('page', Str()),
        StructField('registration', Dec()),
        StructField('sessionId', Int()),
        StructField('song', Str()),
        StructField('status', Int()),
        StructField('ts', Long()),
        StructField('userAgent', Str()),
        StructField('userId', Int())
    ])
    return log_schema
def process_song_data(spark, input_data_songs, output_data):
    """
    Read song data by providing it an expected schema.
    Create songs and artists tables.
    """
    # define song data schema to improve performance
    song_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long())
    ])

    song_data = input_data_songs

    t_start = time()
    dfSongs = spark.read.json(song_data, schema=song_schema)
    t_end = time() - t_start
    print('Read song data in {} secs'.format(t_end))

    dfSongs.printSchema()
    dfSongs.count()
    dfSongs.show(5)

    songs_table = dfSongs.filter(dfSongs.song_id != '')\
        .select(['song_id', 'title', 'artist_id', 'year', 'duration'])
    songs_table.show(5)
    songs_table.write.partitionBy("year", "artist_id").mode('overwrite').parquet(
        output_data + 'songs/songs_table.parquet')

    artists_table = dfSongs.filter(dfSongs.artist_id != '')\
        .select(col("artist_id"), col("artist_name").alias("name"),
                col("artist_location").alias("location"),
                col("artist_longitude").alias("longitude"),
                col("artist_latitude").alias("latitude"))\
        .dropDuplicates()
    artists_table.show(5)
    artists_table.write.mode('overwrite').parquet(
        output_data + 'artists/artists_table.parquet')
def process_immigration_data(spark, path):
    us_immg_df = spark.read.parquet(path)
    us_immg_df = us_immg_df.select(col("i94res").cast(Int()), col("i94port"),
                                   col("arrdate").cast(Int()), col("i94mode").cast(Int()),
                                   col("depdate").cast(Int()), col("i94bir").cast(Int()),
                                   col("i94visa").cast(Int()), col("count").cast(Int()),
                                   "gender", col("admnum").cast(Long()))
    us_immg_df = us_immg_df.dropDuplicates()

    travel_mode = get_travel_mode(spark)
    visa_type = get_visa_type(spark)
    ports = get_ports_data(spark)

    us_immg_df = us_immg_df.join(ports, us_immg_df.i94port == ports.id, how='left')
    us_immg_df = us_immg_df.withColumn("arrival_date", get_date(us_immg_df.arrdate))
    us_immg_df = us_immg_df.drop("id")

    get_timestamp(spark, us_immg_df)
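# get_date (used above) is not shown in this section; a hypothetical sketch,
# assuming arrdate/depdate are SAS numeric dates, i.e. day counts from
# 1960-01-01 as is conventional for the I94 immigration dataset:
from datetime import datetime, timedelta

from pyspark.sql.functions import udf
from pyspark.sql.types import DateType


# hypothetical helper: convert a SAS day offset into a calendar date
get_date = udf(
    lambda days: (datetime(1960, 1, 1) + timedelta(days=int(days))).date()
    if days is not None else None,
    DateType(),
)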
def process_log_data(spark, input_data, output_data):
    """
    Description: Read the log_data JSON files from S3 into a Spark dataframe,
        extract the columns needed for the users table and the time table,
        and write both tables back to S3 in the format this project needs.

    Parameters:
        spark: Spark session
        input_data: location of the log_data JSON files (S3 bucket)
        output_data: location where the final tables will be saved (S3 bucket)

    Return:
        None
    """
    #--------------------read log data--------------------#
    print('Read log_data...')

    # get filepath to log data file
    log_data = os.path.join(input_data, "log_data_*.json")

    # define the log data schema for reading
    LogSchema = R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Long()),
        Fld("lastName", Str()),
        Fld("length", Doub()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Doub()),
        Fld("sessionId", Long()),
        Fld("song", Str()),
        Fld("status", Long()),
        Fld("ts", Long()),
        Fld("userAgent", Str()),
        Fld("userId", Str()),
    ])

    # read log data file (apply the schema defined above)
    log_df = spark.read.json(log_data, schema=LogSchema)

    #--------------------deal with user table--------------------#
    # extract columns for the users dataframe and drop duplicated and null values
    user_df = log_df.select(['userId', 'firstName', 'lastName', 'gender', 'level'])
    user_df = user_df.dropDuplicates(['userId'])
    user_df = user_df.dropna(how="any", subset=["userId"])
    user_df = user_df.filter(user_df.userId != "")
    print('User table: ')
    user_df.sort('userId').show(5)

    # write users table to parquet files
    print('Save User table into S3...')
    user_df.write.parquet("{}/user_table.parquet".format(output_data))

    #--------------------deal with time table--------------------#
    # convert the epoch-millisecond timestamp to a datetime
    def convert_timestamp(x):
        datetime_data = datetime.fromtimestamp(x / 1000)
        return datetime_data

    # register convert_timestamp as a Spark UDF
    convert_timestamp_udf = udf(convert_timestamp, TimestampType())

    # extract the ts column for the time dataframe and drop duplicated and null values
    time_df = log_df.select(['ts'])
    time_df = time_df.dropDuplicates(['ts'])
    time_df = time_df.dropna(how="any", subset=["ts"])

    # use the UDF defined above to convert the "ts" column into datetime format
    time_df = time_df.withColumn('start_time', convert_timestamp_udf('ts'))

    # use pyspark.sql.functions to extract hour, day, week, month, year, weekday
    time_df = time_df.withColumn('hour', F.hour('start_time'))
    time_df = time_df.withColumn('day', F.dayofmonth('start_time'))
    time_df = time_df.withColumn('week', F.weekofyear('start_time'))
    time_df = time_df.withColumn('month', F.month('start_time'))
    time_df = time_df.withColumn('year', F.year('start_time'))
    time_df = time_df.withColumn('weekday', F.dayofweek('start_time'))

    # select the columns needed for the final time table
    time_df = time_df.select(['start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday'])
    print('Time table: ')
    time_df.show(5)

    # write time table to parquet files partitioned by year and month
    print('Save Time table into S3...')
    time_df.write.partitionBy("year", "month").parquet("{}/time_table.parquet".format(output_data))

    #--------------------deal with songplays table--------------------#
    # the songplays table needs the join of song_data and log_data, so read song_data again
    song_data = os.path.join(input_data, "song_data/A/*/*/*.json")

    # define the song data schema for reading
    SongSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Doub()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Doub()),
        Fld("artist_name", Str()),
        Fld("duration", Doub()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long())
    ])

    # read song data file
    song_df = spark.read.json(song_data, schema=SongSchema)

    # filter by actions for song plays
    log_df_filter = log_df.where(log_df.page == 'NextSong')

    # create start_time column by using the UDF defined above
    log_df_filter = log_df_filter.withColumn('start_time', convert_timestamp_udf('ts'))

    # join song_df and log_df_filter
    cond = [log_df_filter.artist == song_df.artist_name,
            log_df_filter.song == song_df.title,
            log_df_filter.length == song_df.duration]
    songplay_df = log_df_filter.join(song_df, cond) \
        .select([F.monotonically_increasing_id().alias('songplay_id'),
                 log_df_filter.start_time,
                 log_df_filter.userId,
                 log_df_filter.level,
                 song_df.song_id,
                 song_df.artist_id,
                 log_df_filter.sessionId,
                 log_df_filter.location,
                 log_df_filter.userAgent,
                 # year and month are needed below to partition the output
                 F.year(log_df_filter.start_time).alias('year'),
                 F.month(log_df_filter.start_time).alias('month')])
    print('Song_play table: ')
    songplay_df.show(10)

    # write songplays table to parquet files partitioned by year and month
    songplay_df.write.partitionBy("year", "month").parquet("{}/songplay_table.parquet".format(output_data))
def process_log_data(spark, input_data, output_data, songs_data):
    """
    Process the log_data JSON files located in S3 and create the users, time
    and songplays tables; songs_data is needed to build the songplays table.
    The tables are stored in parquet format in S3.

    Args:
        spark : Spark Session
        input_data (string) : location of the JSON files (input)
        output_data (string) : location of the parquet files (output)
        songs_data (Spark Dataframe) : song data table

    Returns:
        None
    """
    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # set schema for log data
    logSchema = StructType([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Int()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Int()),
        Fld("song", Str()),
        Fld("status", Int()),
        Fld("ts", Long()),
        Fld("userAgent", Str()),
        Fld("userId", Str()),
    ])

    # read log data file
    df = spark.read.json(log_data, schema=logSchema)

    # filter by actions for song plays
    df = df.filter("page == 'NextSong'")

    # create temporary view for the log data
    df.createOrReplaceTempView("logView")

    # extract columns for users table (keep the latest level per user)
    users_table = spark.sql("""
        WITH latestChange AS (
            SELECT userId AS userIdLatest, MAX(ts) AS maxTs
            FROM logView
            GROUP BY userId
        )
        SELECT userId AS user_id,
               ts AS tsTemp,
               firstName AS first_name,
               lastName AS last_name,
               gender,
               level
        FROM logView AS t1
        JOIN latestChange AS t2
          ON t1.userId = t2.userIdLatest AND t1.ts = t2.maxTs
        WHERE userId IS NOT NULL
    """).dropDuplicates(['user_id']).drop("tsTemp")

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users'), 'overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: getDateTime(int(x)), TST())
    df = df.withColumn("start_time", get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: getDateTime(int(x)), Dat())
    df = df.withColumn("date_time", get_datetime(df.ts))

    # extract columns to create time table ("E" gives the day-of-week name)
    time_table = df.select(col("start_time"),
                           hour(df.start_time).alias("hour"),
                           dayofmonth(df.date_time).alias("day"),
                           weekofyear(df.date_time).alias("week"),
                           month(df.date_time).alias("month"),
                           year(df.date_time).alias("year"),
                           date_format(df.date_time, "E").alias("weekday")) \
        .where("start_time is not null") \
        .dropDuplicates(['start_time'])

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(
        os.path.join(output_data, 'time'), 'overwrite')

    # create temporary views for the log and song tables
    df.createOrReplaceTempView("logView")
    songs_data.createOrReplaceTempView("songView")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = spark.sql("""
        SELECT start_time,
               year(date_time) AS year,
               month(date_time) AS month,
               userId AS user_id,
               level,
               song_id,
               artist_id,
               sessionId AS session_id,
               location,
               userAgent AS user_agent
        FROM logView AS t1
        JOIN songView AS t2
          ON (t1.artist = t2.artist_name)
         AND (t1.song = t2.title)
         AND (t1.length = t2.duration)
    """)
    songplays_table.show(10)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(
        os.path.join(output_data, 'songplays'), 'overwrite')
def process_log_data(spark, input_data, output_data):
    '''
    Description: This function can be used to load the log data from the input S3 bucket
        and write the parquet files to the output S3 bucket.

    Arguments:
        spark: SparkSession
        input_data: location for the input data
        output_data: location for the output data

    Returns:
        None
    '''
    # get filepath to log data file
    log_data = os.path.join(input_data, "log_data/*.json")
    print(log_data)

    logsSchema = R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Int()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Long()),
        Fld("song", Str()),
        Fld("status", Int()),
        Fld("ts", Long()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])

    # read log data file
    df = spark.read.json(log_data, schema=logsSchema).distinct()
    # df = spark.read.json(log_data)
    print('df.count', df.count())
    df.show(5, truncate=False)
    df.printSchema()

    # filter by actions for song plays
    dfSongPlays = df.filter("page == 'NextSong'")

    # extract columns for users table
    dfSongPlays.createOrReplaceTempView("dfSongPlays")
    users_table = spark.sql(
        "select userId as user_id, firstName as first_name, lastName as last_name, gender, level from dfSongPlays"
    ).distinct()
    print('users_table.count', users_table.count())

    # write users table to parquet files
    users_table.repartitionByRange(
        3, "user_id").write.mode('overwrite').parquet(output_data + "users")

    # create timestamp column from original timestamp column
    dfWithDatetime = dfSongPlays.withColumn(
        'datetime', from_unixtime(dfSongPlays.ts / 1000))
    print('after adding datetime')
    dfWithDatetime.show(5, truncate=False)

    # extract columns to create time table
    dfWithDatetime.createOrReplaceTempView("dfWithDatetime")
    time_table = spark.sql("""
        select ts as start_time,
               hour(datetime) as hour,
               dayofmonth(datetime) as day,
               weekofyear(datetime) as week,
               month(datetime) as month,
               year(datetime) as year,
               dayofweek(datetime) as weekday
        from dfWithDatetime
    """).distinct()
    time_table.show(5, truncate=False)

    # write time table to parquet files partitioned by year and month
    time_table.write.mode('overwrite').partitionBy(
        "year", "month").parquet(output_data + "time")

    # read in song data to use for songplays table
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")

    # read song data file
    songsSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int())
    ])
    song_df = spark.read.json(song_data, schema=songsSchema).distinct()

    # extract columns from joined song and log datasets to create songplays table
    # (could also use 'left_outer')
    songplays_df = dfSongPlays.join(song_df,
                                    (dfSongPlays.artist == song_df.artist_name) &
                                    (dfSongPlays.song == song_df.title),
                                    how='left') \
        .withColumn("songplay_id", monotonically_increasing_id()) \
        .withColumn("year", year(from_unixtime(dfSongPlays.ts / 1000))) \
        .withColumn("month", month(from_unixtime(dfSongPlays.ts / 1000)))
    songplays_df.show(5)
    songplays_df.createOrReplaceTempView("songplays_df")
    songplays_table = spark.sql("""
        select songplay_id,
               ts as start_time,
               userId as user_id,
               level,
               song_id,
               artist_id,
               sessionId as session_id,
               location,
               userAgent as user_agent,
               year,
               month
        from songplays_df
    """).distinct()
    songplays_table.show(5)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode('overwrite').partitionBy(
        "year", "month").parquet(output_data + "songplays")
def process_log_data(spark, input_data, output_data):
    """
    Extract data from log data and write the users, time and songplays tables.

    Arguments:
        - spark : SparkSession object
        - input_data : input data root dir path
        - output_data : output data root dir path
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(df.page == "NextSong")

    # extract columns for users table (keep the latest level per user)
    df.createOrReplaceTempView("log_data")
    users_table = spark.sql("""
        SELECT lg.userId as user_id,
               lg.firstName as first_name,
               lg.lastName as last_name,
               lg.gender,
               lg.level
        FROM log_data lg
        JOIN (
            SELECT userId, MAX(ts) as latest
            FROM log_data
            GROUP BY userId
        ) max_ts
          ON lg.userId = max_ts.userId
        WHERE lg.ts = max_ts.latest
    """).dropDuplicates(["user_id"])

    # write users table to parquet files
    users_table.write.parquet(output_data + "users_table")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: (x // 1000), Long())
    df = df.withColumn("unix_timestamp", get_timestamp("ts"))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x), Timestamp())
    df = df.withColumn('datetime', get_datetime("unix_timestamp"))

    # extract columns to create time table
    df.createOrReplaceTempView("log_data_time")
    time_table = spark.sql("""
        SELECT unix_timestamp as start_time,
               EXTRACT(hour from datetime) as hour,
               EXTRACT(day from datetime) as day,
               EXTRACT(week from datetime) as week,
               EXTRACT(month from datetime) as month,
               EXTRACT(year from datetime) as year,
               dayofweek(datetime) as weekday
        FROM log_data_time
    """).dropDuplicates(["start_time"])

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(output_data + "time_table")

    # read in song data to use for songplays table
    song_df = spark.read.json(input_data + "song_data/*/*/*", schema=songSchema)

    # extract columns from joined song and log datasets to create songplays table
    song_df.createOrReplaceTempView("song_data")
    time_table.createOrReplaceTempView("time_table")
    songplays_table = spark.sql("""
        SELECT unix_timestamp as start_time,
               t.year as year,
               t.month as month,
               userId,
               level,
               song_id,
               artist_id,
               sessionId,
               location,
               userAgent
        FROM log_data_time lg
        LEFT JOIN song_data s ON (lg.song = s.title and lg.artist = s.artist_name)
        LEFT JOIN time_table t ON (lg.unix_timestamp = t.start_time)
    """)
    songplays_table = songplays_table.withColumn('songplay_id', F.monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(output_data + "songplays_table")
def process_log_data(spark, input_data, output_data):
    """
    Process the event log data files from S3 input_data and create and extract
    `time` table, `users` table and `songplays` table data and write parquet
    files to S3 output_data.

    :param spark: a SparkSession instance
    :param input_data: input file path
    :param output_data: output file path
    """
    # TODO: get filepath to log data file
    log_data = os.path.join(input_data, "log-data/*/*/*.json")

    # =================================
    # read log data file
    # =================================
    log_schema = StructType([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Long()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Long()),
        Fld("song", Str()),
        Fld("status", Int()),
        Fld("ts", Long()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])

    logger.info("Start reading `log_data` json file(s)...")
    start_time = time()
    df = spark.read.json(log_data, schema=log_schema).drop_duplicates()
    logger.info("Finished reading 'log_data' json file(s)...")
    logger.info(
        "Reading 'log_data' json took: {0:.2f} seconds".format(time() - start_time))

    # TODO: filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # =================================
    # users_table
    # =================================
    # TODO: extract columns for users table
    logger.info("Preparing `users` dataframe")
    user_columns = ["userId", "firstName", "lastName", "gender", "level"]
    users_table = df.select(user_columns)

    logger.info("Start exporting `users` parquet files...")
    start_time = time()

    # write users table to parquet files
    users_table.write \
        .mode("overwrite") \
        .parquet(output_data + "users/")

    logger.info("Finished exporting `users` parquet files")
    logger.info("Exporting `users` parquet files took: {0:.2f} seconds".format(
        time() - start_time))

    # TODO: create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.utcfromtimestamp(x / 1000.0), TimestampType())
    df = df.withColumn("ts_timestamp", get_timestamp("ts"))

    # TODO: create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.utcfromtimestamp(x / 1000.0).strftime('%Y-%m-%d %H:%M:%S'))
    df = df.withColumn("ts_datetime", get_datetime("ts"))

    # =================================
    # time_table
    # =================================
    # TODO: extract columns to create time table
    logger.info("Preparing `time` dataframe")
    time_column = [
        "ts", "ts_datetime as start_time", "hour", "day", "week", "month",
        "year", "weekday"
    ]
    time_table = df.withColumn("hour", hour("ts_timestamp")) \
        .withColumn("day", dayofmonth("ts_timestamp")) \
        .withColumn("week", weekofyear("ts_timestamp")) \
        .withColumn("month", month("ts_timestamp")) \
        .withColumn("year", year("ts_timestamp")) \
        .withColumn("weekday", dayofweek("ts_timestamp")) \
        .selectExpr(time_column).drop_duplicates()

    logger.info("Start exporting `time` parquet files...")
    start_time = time()

    # TODO: write time table to parquet files partitioned by year and month
    time_table.write \
        .mode('overwrite') \
        .partitionBy('year', 'month') \
        .parquet(output_data + "time/")

    logger.info("Finished exporting `time` parquet files")
    logger.info("Exporting `time` parquet files took: {0:.2f} seconds".format(
        time() - start_time))

    # =================================
    # songplays_table
    # =================================
    # read in song data to use for songplays table
    song_input = os.path.join(input_data, "song-data/*/*/*/*.json")
    logger.info("Start reading `song_df` json file(s)...")
    song_df = spark.read.json(song_input)

    # TODO: extract columns from joined song and log datasets to create
    # the songplays table, joined with song_df
    logger.info("Start joining `song_df` and log datasets...")
    songplays_table = df.join(song_df,
                              [song_df.title == df.song,
                               song_df.artist_name == df.artist],
                              how='inner') \
        .select([monotonically_increasing_id().alias("songplay_id"),
                 col("ts_datetime").alias("start_time"),
                 "userId", "level", "song_id", "artist_id", "sessionId",
                 "location", "userAgent"])

    # TODO: join with time_table to extract month and year
    songplays_table = songplays_table.join(time_table,
                                           [songplays_table.start_time == time_table.start_time],
                                           how='inner') \
        .select(
            "songplay_id",
            songplays_table.start_time,
            "userId",
            "level",
            "song_id",
            "artist_id",
            "sessionId",
            "location",
            "userAgent",
            "month",
            "year"
        )

    logger.info("Start exporting `songplays` parquet files...")
    start_time = time()

    # TODO: write songplays table to parquet files partitioned by year and month
    songplays_table.write \
        .mode('overwrite') \
        .partitionBy('year', 'month') \
        .parquet(output_data + "songplays/")

    logger.info("Finished exporting `songplays` parquet files")
    logger.info(
        "Exporting `songplays` parquet files took: {0:.2f} seconds".format(
            time() - start_time))
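# The two Python UDFs above work, but the same columns can be derived with
# Spark's built-in functions, which avoids Python serialization overhead.
# A sketch of an equivalent, assuming the same millisecond `ts` column
# (note the cast uses the session time zone, while utcfromtimestamp is UTC):
from pyspark.sql.functions import col, date_format

df = df.withColumn("ts_timestamp", (col("ts") / 1000).cast("timestamp"))
df = df.withColumn("ts_datetime", date_format("ts_timestamp", "yyyy-MM-dd HH:mm:ss"))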
Fld("artist_latitude",Dec(10,5)), Fld("artist_location",Str()), Fld("artist_longitude", Dec(10,5)), Fld("artist_name",Str()), Fld("duration",Dec(10,5)), Fld("num_songs",Int()), Fld("song_id", Str()), Fld("title",Str()), Fld("year",Int()), ]) logSchema = R([ Fld("artist",Str()), Fld("auth", Str()), Fld("firstName",Str()), Fld("gender", Str()), Fld("itemInSession",Int()), Fld("lastName",Str()), Fld("length",Dec(10,5)), Fld("level", Str()), Fld("location",Str()), Fld("method",Str()), Fld("page",Str()), Fld("registration", Dbl()), Fld("sessionId",Int()), Fld("song", Str()), Fld("status",Str()), Fld("ts",Long()), Fld("userAgent",Str()), Fld("userId", Str()), ])
def process_log_data(spark, input_data, output_data):
    '''
    Load log data in JSON format from the S3 bucket, process it by extracting
    the users table, time table and songplays table, and save these tables
    back to the S3 bucket.

    :param spark: spark session
    :param input_data: data location for input data
    :param output_data: data location for output data
    :return: no return value
    '''
    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'
    # log_data = input_data + 'log_data/*.json'  # for local files

    logdataSchema = R([
        Fld('artist', Str()),
        Fld('auth', Str()),
        Fld('firstName', Str()),
        Fld('gender', Str()),
        Fld('itemInSession', Long()),
        Fld('lastName', Str()),
        Fld('length', Dbl()),
        Fld('level', Str()),
        Fld('location', Str()),
        Fld('method', Str()),
        Fld('page', Str()),
        Fld('registration', Dbl()),
        Fld('sessionId', Long()),
        Fld('song', Str()),
        Fld('status', Long()),
        Fld('ts', Long()),
        Fld('userAgent', Str()),
        Fld('userId', Str()),
    ])

    # load json files from S3
    df_log = spark.read.json(log_data, schema=logdataSchema)
    df_log = df_log.filter(df_log.page == 'NextSong')

    # select users columns
    users_attr = ['userId', 'firstName', 'lastName', 'gender', 'level']
    users_table = df_log.select(users_attr)\
        .dropDuplicates()
    users_table = users_table\
        .withColumnRenamed('userId', 'user_id')\
        .withColumnRenamed('firstName', 'first_name')\
        .withColumnRenamed('lastName', 'last_name')

    # write users table to S3
    users_table.write.parquet(output_data + 'users/')

    # create time table ("mm" is minutes; "MM" would be months)
    tsFormat = 'yyyy-MM-dd HH:mm:ss z'
    time_table = df_log.withColumn(
        'ts',
        to_timestamp(date_format((df_log.ts / 1000).cast(dataType=Tst()), tsFormat), tsFormat))
    time_table = time_table.select(
        col('ts').alias('start_time'),
        hour(col('ts')).alias('hour'),
        dayofmonth(col('ts')).alias('day'),
        weekofyear(col('ts')).alias('week'),
        month(col('ts')).alias('month'),
        year(col('ts')).alias('year'))

    # write time table to S3
    time_table.write.partitionBy('year', 'month').parquet(output_data + 'time/')

    # load songs and artists tables from the previous step
    df_songs = spark.read.parquet(output_data + 'songs/*/*/*')
    df_artists = spark.read.parquet(output_data + 'artists/*')
    df_artists = df_artists.drop('location')

    # create songs_logs table
    songs_logs = df_log.join(df_songs, (df_log.song == df_songs.title))

    # create artists_songs_logs table
    artists_songs_logs = songs_logs.join(
        df_artists, (songs_logs.artist == df_artists.name))
    artists_songs_logs = artists_songs_logs.withColumn(
        'ts',
        to_timestamp(date_format((artists_songs_logs.ts / 1000).cast(dataType=Tst()), tsFormat), tsFormat))

    # create songplays table
    songplays = artists_songs_logs.join(
        time_table,
        artists_songs_logs.ts == time_table.start_time, 'left')
    songplays_attr = [
        'start_time', 'userId', 'level', 'song_id', 'artist_id', 'sessionId',
        'location', 'userAgent', 'year', 'month'
    ]
    songplays_table = songplays.select(songplays_attr)\
        .dropDuplicates()
    songplays_table = songplays_table\
        .withColumnRenamed('userId', 'user_id')\
        .withColumnRenamed('sessionId', 'session_id')\
        .withColumnRenamed('userAgent', 'user_agent')\
        .repartition('year', 'month')

    # write songplays table to S3
    songplays_table.write.partitionBy('year', 'month').parquet(output_data + 'songplays/')
def process_log_data(spark, input_data, output_data):
    """
    Load the log_data dataset and extract the columns for the users and time
    tables, then read both the log_data and song_data datasets and extract the
    columns for the songplays table. The tables are written to parquet files
    on S3.
    """
    song_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long())
    ])

    log_schema = R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Str()),
        Fld("lastName", Str()),
        Fld("length", Str()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Str()),
        Fld("sessionId", Str()),
        Fld("song", Str()),
        Fld("status", Str()),
        Fld("ts", Str()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])

    # get filepath to log data file
    log_data = 'log_data/*/*/*.json'

    # read log data file
    df = spark.read.json(os.path.join(input_data, log_data), schema=log_schema)

    # filter by actions for song plays
    actions_df = df.filter(df.page == 'NextSong') \
        .select('ts', 'userId', 'level', 'song', 'artist', 'sessionId',
                'location', 'userAgent')

    # extract columns for users table
    users_table = df.select('userId', 'firstName', 'lastName', 'gender', 'level').dropDuplicates()
    users_table.createOrReplaceTempView('users')

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users/users.parquet'), 'overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: str(int(int(x) / 1000)))
    actions_df = actions_df.withColumn('timestamp', get_timestamp(actions_df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000)))
    actions_df = actions_df.withColumn('datetime', get_datetime(actions_df.ts))

    # extract columns to create time table
    time_table = actions_df.select('datetime') \
        .withColumn('start_time', actions_df.datetime) \
        .withColumn('hour', hour('datetime')) \
        .withColumn('day', dayofmonth('datetime')) \
        .withColumn('week', weekofyear('datetime')) \
        .withColumn('month', month('datetime')) \
        .withColumn('year', year('datetime')) \
        .withColumn('weekday', dayofweek('datetime')) \
        .dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time/time.parquet'), 'overwrite')

    # read in song data to use for songplays table
    song_data = 'song_data/*/*/*/*.json'
    song_df = spark.read.json(os.path.join(input_data, song_data), schema=song_schema)

    # extract columns from joined song and log datasets to create songplays table
    actions_df = actions_df.alias('log_df')
    song_df = song_df.alias('song_df')
    joined_df = actions_df.join(
        song_df, col('log_df.artist') == col('song_df.artist_name'), 'inner')
    songplays_table = joined_df.select(
        col('log_df.datetime').alias('start_time'),
        col('log_df.userId').alias('user_id'),
        col('log_df.level').alias('level'),
        col('song_df.song_id').alias('song_id'),
        col('song_df.artist_id').alias('artist_id'),
        col('log_df.sessionId').alias('session_id'),
        col('log_df.location').alias('location'),
        col('log_df.userAgent').alias('user_agent'),
        year('log_df.datetime').alias('year'),
        month('log_df.datetime').alias('month')) \
        .withColumn('songplay_id', monotonically_increasing_id())
    songplays_table.createOrReplaceTempView('songplays')

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'songplays/songplays.parquet'), 'overwrite')
def process_log_data(spark, input_data, output_data):
    """
    Reads from log files, transforms them into users, time, and songplays
    data, and writes them in parquet format.

    params:
        - spark: spark session object
        - input_data: input data path
        - output_data: output data path
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"

    # use schema when reading json files
    log_schema = St([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Long()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Long()),
        Fld("song", Str()),
        Fld("status", Long()),
        Fld("ts", Long()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])

    # read log data file
    df = spark.read.json(log_data, schema=log_schema)

    # filter by actions for song plays
    df = df.where("page='NextSong'")

    # extract columns for users table
    users_table = df.selectExpr("userId as user_id", "firstName as first_name",
                                "lastName as last_name", "gender", "level").dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + "users", mode="overwrite")

    # change column name from ts to start_time
    time_table = df.select(col("ts").alias("start_time")).dropDuplicates()

    # convert datatype of start_time into datetime
    get_timestamp = udf(lambda ts: (datetime.fromtimestamp(ts // 1000)), Ts())
    time_table = time_table.withColumn("start_time", get_timestamp("start_time"))

    # add columns to create time table
    time_table = time_table \
        .withColumn("hour", hour("start_time"))\
        .withColumn("day", date_format("start_time", "dd"))\
        .withColumn("weekofyear", weekofyear("start_time"))\
        .withColumn("month", month("start_time"))\
        .withColumn("year", year("start_time"))\
        .withColumn("weekday", dayofweek("start_time"))

    # write time table to parquet files
    time_table.write.parquet(output_data + "time", mode="overwrite")

    # read in song data to use for songplays and artists table
    song_df = spark.read.parquet(output_data + "songs")
    artist_df = spark.read.parquet(output_data + "artists")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df \
        .join(song_df, (df.song == song_df.title) & (df.length == song_df.duration))\
        .join(artist_df, song_df.artist_id == artist_df.artist_id)\
        .select(get_timestamp("ts").alias("start_time"),
                col("userId").alias("user_id"),
                df.level,
                song_df.song_id,
                artist_df.artist_id,
                col("sessionId").alias("session_id"),
                df.location,
                col("userAgent").alias("user_agent"))\
        .dropDuplicates()

    # add year and month columns for partitioning
    songplays_table = songplays_table\
        .withColumn("year", year("start_time"))\
        .withColumn("month", month("start_time"))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(output_data + "songplays/songplays",
                                  mode="overwrite", partitionBy=["year", "month"])
def process_log_data(spark, input_data_logs, output_data):
    """
    Read the log data using the expected schema.
    Create users, time and songplays tables.
    """
    # create log data schema to improve performance
    log_schema = R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Long()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Long()),
        Fld("song", Str()),
        Fld("status", Long()),
        Fld("ts", Long()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])

    log_data = input_data_logs

    t_start = time()
    dfLogs = spark.read.json(log_data, schema=log_schema)
    t_end = time() - t_start
    print('Read log data in {} secs'.format(t_end))

    dfLogs.printSchema()
    dfLogs.count()
    dfLogs.show(5)

    # filter NextSong records
    dfNextSongLogs = dfLogs.filter(dfLogs.page == 'NextSong')

    users_table = dfNextSongLogs.filter(dfNextSongLogs.userId != '') \
        .select(col("userId").alias("user_id"), col("firstName").alias("first_name"),
                col("lastName").alias("last_name"), col("gender"), col("level")) \
        .dropDuplicates()
    users_table.show(20)
    users_table.write.mode('overwrite').parquet(output_data + 'users/users_table.parquet')

    get_timestamp = udf(lambda ms: datetime.fromtimestamp(ms / 1000.0), TimestampType())
    dfNextSongLogs = dfNextSongLogs.withColumn('start_time', get_timestamp('ts'))

    time_table = dfNextSongLogs.select('start_time')\
        .withColumn('hour', hour('start_time')).withColumn('day', dayofmonth('start_time'))\
        .withColumn('week', weekofyear('start_time')).withColumn('month', month('start_time'))\
        .withColumn('year', year('start_time')).withColumn('weekday', dayofweek('start_time'))
    time_table.show(5)
    time_table.write.partitionBy(
        "year", "month").mode('overwrite').parquet(output_data + 'time/time_table.parquet')

    dfSongs = spark.read.parquet(output_data + '/songs/')

    songplays_table = dfNextSongLogs.join(dfSongs,
                                          (dfNextSongLogs.song == dfSongs.title) &
                                          (dfNextSongLogs.length == dfSongs.duration),
                                          'left_outer')\
        .select(
            dfNextSongLogs.start_time,
            col("userId").alias('user_id'),
            dfNextSongLogs.level,
            dfSongs.song_id,
            dfSongs.artist_id,
            col("sessionId").alias("session_id"),
            dfNextSongLogs.location,
            col("useragent").alias("user_agent"),
            year('start_time').alias('year'),
            month('start_time').alias('month'))\
        .withColumn("idx", monotonically_increasing_id())

    songplays_table = songplays_table.filter(
        "song_id is not null and artist_id is not null")
    songplays_table.show(5)
    songplays_table.write.partitionBy(
        "year", "month").mode('overwrite').parquet(output_data + 'songplays/songplays_table.parquet')
if __name__ == "__main__": s3_bucket = sys.argv[1] s3_key = sys.argv[2] aws_key = sys.argv[3] aws_secret_key = sys.argv[4] redshift_conn_string = sys.argv[5] db_user = sys.argv[6] db_pass = sys.argv[7] spark = create_spark_session(aws_key, aws_secret_key) movies_schema = StructType([ Fld("adult", String()), Fld("belongs_to_collection", Long()), Fld("budget", Long()), Fld("genres", String()), Fld("homepage", String()), Fld("id", Int()), Fld("imdb_id", String()), Fld("original_language", String()), Fld("original_title", String()), Fld("overview", String()), Fld("popularity", Dbl()), Fld("poster_path", String()), Fld("production_company", String()), Fld("production_country", String()), Fld("release_date", Date()), Fld("revenue", Long()), Fld("runtime", Float()),
def process_song_data(spark, input_data, output_data):
    """
    Process the songs data files from S3 input_data and create and extract
    `songs` table and `artist` table data and write parquet files to S3
    output_data.

    :param spark: a SparkSession instance
    :param input_data: input file path
    :param output_data: output file path
    """
    # TODO: get filepath to song data file
    song_data = os.path.join(input_data, "song-data/*/*/*/*.json")

    # =================================
    # read song data file
    # =================================
    # This schema is based on conducting data profiling
    song_schema = StructType([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long()),
    ])

    logger.info("Start reading `song_data` json file(s)...")
    start_time = time()
    df = spark.read.json(song_data, schema=song_schema).drop_duplicates()
    logger.info("Finished reading 'song_data' json file(s)...")
    logger.info(
        "Reading 'song_data' json took: {0:.2f} seconds".format(time() - start_time))

    # =================================
    # songs_table
    # =================================
    # TODO: extract columns to create songs table
    logger.info("Preparing `songs` dataframe")
    song_columns = ["song_id", "title", "artist_id", "year", "duration"]
    songs_table = df.select(song_columns).drop_duplicates()

    logger.info("Start exporting `songs` parquet files...")
    start_time = time()

    # TODO: write songs table to parquet files partitioned by year and artist
    songs_table.write \
        .mode("overwrite") \
        .partitionBy('year', 'artist_id') \
        .parquet(output_data + "songs/")

    logger.info("Finished exporting `songs` parquet files")
    logger.info("Exporting `songs` parquet files took: {0:.2f} seconds".format(
        time() - start_time))

    # =================================
    # artists_table
    # =================================
    # TODO: extract columns to create artists table
    logger.info("Preparing `artists` dataframe")
    artist_columns = [
        "artist_id", "artist_name as name", "artist_location as location",
        "artist_longitude as longitude", "artist_latitude as latitude"
    ]
    artists_table = df.selectExpr(artist_columns)

    logger.info("Start exporting `artists` parquet files...")
    start_time = time()

    # TODO: write artists table to parquet files
    artists_table.write \
        .mode("overwrite") \
        .parquet(output_data + "artists/")

    logger.info("Finished exporting `artists` parquet files")
    logger.info(
        "Exporting `artists` parquet files took: {0:.2f} seconds".format(
            time() - start_time))
def process_log_data(spark, input_data, output_data):
    """
    Read the songplay log json files into parquet tables.

    :param spark: spark session
    :type spark: SparkSession
    :param input_data: path (local or s3) prefix to the log_data root
    :type input_data: str
    :param output_data: path (local or s3) to write output parquet files to
    :type output_data: str
    :return: None
    :rtype: None
    """
    # get filepath to log data file
    log_data = input_data + 'log_data/*.json'

    log_data_schema = R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Int()),
        Fld("lastName", Str()),
        Fld("length", Decimal()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Int(), nullable=False),
        Fld("song", Str()),
        Fld("status", Int()),
        Fld("ts", Long(), nullable=False),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])

    # read log data file
    df = spark.read.json(log_data, schema=log_data_schema)

    # filter by actions for song plays
    df = df.where(col('page') == "NextSong")

    # extract columns for users table
    users_table = df.selectExpr('userId as user_id',
                                'firstName as first_name',
                                'lastName as last_name',
                                'gender',
                                'level')

    # write users table to parquet files
    users_table.write.parquet(output_data + 'users_table')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: x / 1000.0, Dbl())
    df = df.withColumn('epoch_ts', get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x / 1000.0), TimeStamp())
    df = df.withColumn('dt', get_datetime(df.ts))

    # extract columns to create time table
    time_table = df.selectExpr('dt as start_time',
                               'hour(dt) as hour',
                               'dayofmonth(dt) as day',
                               'weekofyear(dt) as week',
                               'month(dt) as month',
                               'year(dt) as year',
                               'dayofweek(dt) as weekday')

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(output_data + 'time_table')

    # read in song data to use for songplays table
    song_data = input_data + "song_data/*/*/*/*.json"
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    # Join on artist name and song title match
    cond = [df.artist == song_df.artist_name, df.song == song_df.title]  # , df.length == song_df.duration]
    songplays_table = df.join(song_df, cond, 'inner') \
        .selectExpr('dt as start_time',
                    'userId as user_id',
                    'level',
                    'song_id',
                    'artist_id',
                    'sessionId as session_id',
                    'location',
                    'userAgent as user_agent',
                    'year(dt) as year',
                    'month(dt) as month') \
        .withColumn('songplay_id', monotonically_increasing_id())  # for autoincrement primary key

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(output_data + 'songplays_table')
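# A typical driver for the process_* functions above (a sketch; the bucket
# paths are placeholders, and create_spark_session is the hypothetical helper
# sketched earlier):
import os


def main():
    spark = create_spark_session(os.environ["AWS_ACCESS_KEY_ID"],
                                 os.environ["AWS_SECRET_ACCESS_KEY"])
    input_data = "s3a://input-bucket/"     # placeholder
    output_data = "s3a://output-bucket/"   # placeholder

    process_song_data(spark, input_data, output_data)
    process_log_data(spark, input_data, output_data)

    spark.stop()


if __name__ == "__main__":
    main()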