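"""ETL pipeline for the Sparkify data lake.

Reads the song and log JSON datasets from S3, transforms them with Spark
into the songs, artists, users, time and songplays tables, and writes each
table back to S3 as parquet files.
"""
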
import configparser
from datetime import datetime
import os

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import udf, col, row_number
from pyspark.sql.functions import year, month, dayofmonth, \
    hour, weekofyear, date_format
from pyspark.sql.types import TimestampType, StructType, \
    StructField, IntegerType, StringType, FloatType, DoubleType


config = configparser.ConfigParser()
config.read('dl.cfg')

os.environ['AWS_ACCESS_KEY_ID'] = \
    config.get('CREDENTIALS', 'AWS_ACCESS_KEY_ID')
os.environ['AWS_SECRET_ACCESS_KEY'] = \
    config.get('CREDENTIALS', 'AWS_SECRET_ACCESS_KEY')
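
# The credentials above are read from dl.cfg, which is expected to look
# roughly like this (placeholder values, not real credentials):
#
#   [CREDENTIALS]
#   AWS_ACCESS_KEY_ID = <your access key>
#   AWS_SECRET_ACCESS_KEY = <your secret key>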


def create_spark_session():
    """Return a session to execute against the AWS EMR cluster.

    If no session exists yet, a new one is built and configured with the
    AWS Hadoop package; otherwise the existing session is returned.

    :return:
        SparkSession: The entry point to programming Spark with the Dataset
        and DataFrame API.
    """
    spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .appName("sparkifyDataLake") \
        .getOrCreate()
    return spark


# Define the schema of the song data for faster loading into Spark
schema = StructType([
    StructField("num_songs", IntegerType(), True),
    StructField("artist_id", StringType(), True),
    StructField("artist_latitude", StringType(), True),
    StructField("artist_longitude", StringType(), True),
    StructField("artist_location", StringType(), True),
    StructField("artist_name", StringType(), True),
    StructField("song_id", StringType(), True),
    StructField("title", StringType(), True),
    StructField("duration", FloatType(), True),
    StructField("year", IntegerType(), True),
])


def process_song_data(spark, input_data, output_data):
    """Extract and transform the song data available in S3, and load it
    into two tables: songs and artists. The tables are stored as parquet
    files in S3.

    Args:
        spark (SparkSession): The entry point to programming Spark with the
            Dataset and DataFrame API.
        input_data: path to the song data
        output_data: location where the songs and artists tables will be stored
    """
    # get filepath to song data file
    song_data = input_data + "song_data/*/*/*/*.json"

    # read song data file
    df = spark.read.json(song_data, schema)

    # extract columns to create songs table
    songs_table_cols = ["song_id",
                        "title",
                        "artist_id",
                        "year",
                        "duration"]
    songs_table = df[songs_table_cols].na.\
        drop(subset=["song_id"]).dropDuplicates(subset=["song_id"])

    # write songs table to parquet files partitioned by year and artist
    output_songs = output_data + "songs"
    songs_table.write.mode("overwrite").partitionBy("year", "artist_id").\
        parquet(output_songs)
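
    # partitionBy above lays the songs table out roughly as (illustrative):
    #   <output_data>songs/year=<year>/artist_id=<artist_id>/part-*.parquet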

    artists_cols = ["artist_id",
                    "artist_name",
                    "artist_location",
                    "artist_latitude",
                    "artist_longitude"]

    # Rank the rows per artist so a single one can be chosen when an
    # artist_id appears more than once
    window = Window.partitionBy("artist_id").orderBy("year")
    ranked_artists = df.withColumn("row_num", row_number().over(window))

    # extract columns to create artists table
    artists_table = ranked_artists.filter(ranked_artists.row_num == 1).\
        select(artists_cols)

    # write artists table to parquet files
    output_artists = output_data + "artists"
    artists_table.write.mode("overwrite").parquet(output_artists)


def process_log_data(spark, input_data, output_data):
    """Extract and transform both the log and song data available in S3,
    and load them into three tables: users, time and songplays. The tables
    are stored as parquet files in S3.

    Args:
        spark (SparkSession): The entry point to programming Spark with the
            Dataset and DataFrame API.
        input_data: path to the log data
        output_data: location where the users, time and songplays tables
            will be stored.
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"
    song_data = input_data + "song_data/*/*/*/*.json"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    # first we get the latest visits, i.e. the row with the maximum
    # timestamp for each user_id
    users_cols = [
        col("userId").alias("user_id"),
        col("firstName").alias("first_name"),
        col("lastName").alias("last_name"),
        "gender",
        "level"]
    latest_visits = df.na.drop(subset=["userId"]). \
        groupBy(df.userId).max("ts"). \
        select(col("max(ts)").alias("max_ts")).collect()
    max_ts_values = [row.max_ts for row in latest_visits]
    users_table = df.filter(df.ts.isin(max_ts_values)).select(users_cols)

    # write users table to parquet files
    output_users = output_data + "users"
    users_table.write.mode("overwrite").parquet(output_users)

    # create a timestamp column in seconds from the original column,
    # which is in milliseconds (return the value as a double so the
    # next udf receives a number rather than a string)
    get_timestamp = udf(lambda x: x / 1000.0, DoubleType())
    df = df.withColumn('timestamp', get_timestamp('ts'))

    # create a datetime column from the new timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x), TimestampType())
    df = df.withColumn('start_time', get_datetime('timestamp'))

    # extract columns to create time table
    time_table = df.select("start_time",
                           hour("start_time").alias("hour"),
                           dayofmonth("start_time").alias("day"),
                           weekofyear("start_time").alias("week"),
                           month("start_time").alias("month"),
                           year("start_time").alias("year"),
                           date_format("start_time", 'u')
                           .alias("weekday").cast(IntegerType())
                           )

    # write time table to parquet files partitioned by year and month
    output_time = output_data + "time"
    time_table.write.mode("overwrite").partitionBy("year", "month").\
        parquet(output_time)

    # read in song data to use for songplays table
    song_df = spark.read.json(song_data, schema)

    # extract columns from joined song and log datasets to create
    # songplays table
    songplays_cols = ["start_time",
                      col("userId").alias("user_id"),
                      "level",
                      "song_id",
                      "artist_id",
                      col("sessionId").alias("session_id"),
                      "location",
                      col("userAgent").alias("user_agent"),
                      year(col("start_time")).alias("year"),
                      month(col("start_time")).alias("month")
                      ]
    songplays_table = df.join(song_df,
                              ((df.song == song_df.title)
                               & (df.artist == song_df.artist_name)
                               & (df.length == song_df.duration)),
                              how='left'
                              ).select(songplays_cols)

    # write songplays table to parquet files partitioned by year and month
    output_songplays = output_data + "songplays"
    songplays_table.write.mode("overwrite").partitionBy("year", "month").\
        parquet(output_songplays)
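

# A sketch (not called anywhere) of a window-based alternative to the
# collect()/isin() pattern used above for picking each user's latest row;
# the helper name is hypothetical.
def latest_user_rows(log_df):
    """Keep only the most recent log row per userId using a window function."""
    w = Window.partitionBy("userId").orderBy(col("ts").desc())
    return (log_df
            .na.drop(subset=["userId"])
            .withColumn("row_num", row_number().over(w))
            .filter(col("row_num") == 1)
            .drop("row_num"))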


def main():
    spark = create_spark_session()
    # reducing shuffle partitions from the default 200
    # to better match our data size
    spark.conf.set("spark.sql.shuffle.partitions", 64)

    input_data = "s3a://udacity-dend/"
    output_data = ""

    process_song_data(spark, input_data, output_data)
    process_log_data(spark, input_data, output_data)

    spark.stop()


if __name__ == "__main__":
    main()
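
# Example invocation (assumes dl.cfg sits next to this script and that
# output_data in main() has been set to a writable location such as an
# S3 bucket):
#   spark-submit etl.py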