from datetime import datetime

from pyspark.sql.functions import udf, monotonically_increasing_id
from pyspark.sql.types import (
    StructType, StructType as R,
    StructField, StructField as Fld,
    StringType as Str, IntegerType as Int, DoubleType as Dbl,
    DecimalType as Dec, LongType as Long, DateType as Date, TimestampType
)


def create_log_schema():
    """
    Create the schema for the log data.
    :return: log dataset schema
    """
    log_schema = StructType([
        StructField("artist", Str()),
        StructField("auth", Str()),
        StructField("firstName", Str()),
        StructField("gender", Str()),
        StructField("itemInSession", Int()),
        StructField("lastName", Str()),
        StructField("length", Dbl()),
        StructField("level", Str()),
        StructField("location", Str()),
        StructField("method", Str()),
        StructField("page", Str()),
        StructField("registration", Dec()),
        StructField("sessionId", Int()),
        StructField("song", Str()),
        StructField("status", Int()),
        StructField("ts", Long()),
        StructField("userAgent", Str()),
        StructField("userId", Int())
    ])
    return log_schema
def get_song_schema():
    """
    Creates a schema for song data.
    :return: schema
    """
    song_schema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dec()),
        Fld("artist_longitude", Dec()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])
    return song_schema
def create_song_schema():
    """
    Create schema for song data.
    :return: schema
    """
    song_schema = StructType([
        StructField("num_songs", Int()),
        StructField("artist_id", Str()),
        StructField("artist_latitude", Dec()),
        StructField("artist_longitude", Dec()),
        StructField("artist_location", Str()),
        StructField("artist_name", Str()),
        StructField("song_id", Str()),
        StructField("title", Str()),
        StructField("duration", Dbl()),
        StructField("year", Int())
    ])
    return song_schema
def get_song_schema():
    """
    Description: Creates a schema for song data.

    Arguments:
        None

    Returns:
        song dataset schema
    """
    song_schema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dec()),
        Fld("artist_longitude", Dec()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),
    ])
    return song_schema
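# Minimal usage sketch (not part of the original module): pass the helper's
# return value to spark.read.json so Spark applies the declared types instead of
# inferring the schema from every JSON file. The function name and the input
# path below are hypothetical, for illustration only.
def example_read_song_data(spark, input_data):
    """Read the raw song .json files with the explicit schema from get_song_schema()."""
    song_df = spark.read.json(input_data + "song_data/*/*/*/*.json",
                              schema=get_song_schema())
    song_df.printSchema()  # confirms the declared types were applied
    return song_df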
def process_song_data(spark, input_data, output_data, aws=True):
    """
    Process the song data by reading the .json files in the song_data folder
    inside input_data, and create the songs and artists tables.
    If aws is True, write the tables to the S3 bucket given in output_data.

    args:
        - spark: the Spark session
        - input_data: the path to the folder containing song_data
        - output_data: the path to the S3 bucket where the tables are written
        - aws: set to True when the script is executed on the cluster;
          set to False when executing locally for debugging
    """
    # Define the song schema before importing the data
    songsSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dec()),
        Fld("artist_longitude", Dec()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),
    ])

    # get filepath to song data file
    song_data = input_data + "song_data/*/*/*/*.json"

    # read song data file
    print("Reading song_data from {}\n".format(song_data))
    df = spark.read.json(song_data, schema=songsSchema)

    # extract columns to create songs table
    print("Extracting columns to create the songs table...\n")
    df.createOrReplaceTempView("songs_data_table")
    songs_table = spark.sql('''
        SELECT DISTINCT song_id, title, artist_id, year, duration
        FROM songs_data_table
    ''')
    print("done.")

    # print songs table schema
    print("Songs table schema:\n")
    songs_table.printSchema()

    # write songs table to parquet files partitioned by year and artist
    if aws:
        print("Writing the songs table to parquet files partitioned by year and artist...\n")
        songs_table.write.parquet(output_data + "songs_table.parquet",
                                  partitionBy=["year", "artist_id"],
                                  mode="overwrite")
        print("done.")

    # extract columns to create artists table
    print("Extracting columns to create the artists table...\n")
    artists_table = spark.sql('''
        SELECT DISTINCT artist_id,
                        artist_name AS name,
                        artist_location AS location,
                        artist_latitude AS latitude,
                        artist_longitude AS longitude
        FROM songs_data_table
    ''')
    print("done.")

    # print artists table schema
    print("Artists table schema:\n")
    artists_table.printSchema()

    # write artists table to parquet files
    if aws:
        print("Writing the artists table to parquet files...\n")
        artists_table.write.parquet(output_data + "artists_table.parquet",
                                    mode="overwrite")
        print("done.")
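# Hedged example of a local debug run of process_song_data; aws=False skips the
# parquet writes. The local SparkSession settings and the data/output paths are
# assumptions for illustration only.
def example_debug_song_etl():
    from pyspark.sql import SparkSession

    spark = (SparkSession.builder
             .master("local[*]")
             .appName("song-etl-debug")
             .getOrCreate())
    # input_data points at a local copy of the dataset; output_data is not
    # written to because aws=False disables the writes.
    process_song_data(spark, "data/", "output/", aws=False)
    spark.stop()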
def process_log_data(spark, input_data, output_data, aws=True):
    """
    Process the log and song data by reading
        - the .json files in the log_data folder
        - the .json files in the song_data folder
    and create the users, time and songplays tables.
    If aws is True, write the tables to the S3 bucket given in output_data.

    args:
        - spark: the Spark session
        - input_data: the path to the folder containing log_data and song_data
        - output_data: the path to the S3 bucket where the tables are written
        - aws: set to True when the script is executed on the cluster;
          set to False when executing locally for debugging
    """
    songsSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dec()),
        Fld("artist_longitude", Dec()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),
    ])

    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"

    # read log data file
    print("Reading log_data from {}\n".format(log_data))
    df = spark.read.json(log_data)
    print("done.")

    # filter by actions for song plays
    print("Filter by actions for song plays...")
    df = df.filter(df.page == 'NextSong')
    df.createOrReplaceTempView("logs_data_table")
    print("done.")

    # extract columns for users table
    print("Extract columns for users table...")
    users_table = spark.sql('''
        SELECT DISTINCT userId AS user_id,
                        firstName AS first_name,
                        lastName AS last_name,
                        gender,
                        level
        FROM logs_data_table
    ''')
    users_table = users_table.dropDuplicates(["user_id"])
    print("done.")

    # write users table to parquet files
    if aws:
        print("Write users table to parquet files...")
        users_table.write.parquet(output_data + "users_table.parquet", mode="overwrite")
        print("done.")

    # create datetime column from original timestamp column
    print("Create datetime column from original timestamp column...")
    get_datetime = udf(lambda time: datetime.fromtimestamp(time / 1000.0).date(), Date())
    df = df.withColumn("date", get_datetime("ts"))
    print("done.")

    # create timestamp column from original timestamp column
    print("Create timestamp column from original timestamp column...")
    convert_ts = udf(lambda time: datetime.fromtimestamp(time / 1000.0), TimestampType())
    df = df.withColumn("ts", convert_ts("ts"))
    print("done.")

    # extract columns to create time table
    print("Extract columns to create time table...")
    df.createOrReplaceTempView("clean")
    time_table = spark.sql('''
        SELECT ts AS start_time,
               date_format(date, 'yyyy') AS year,
               date_format(date, 'MM') AS month,
               date_format(date, 'dd') AS day,
               date_format(date, 'w') AS week,
               date_format(ts, 'E') AS weekday,
               HOUR(ts) AS hour
        FROM clean
    ''').dropDuplicates(["start_time"])
    print("done.")

    # write time table to parquet files partitioned by year and month
    if aws:
        print("Write time table to parquet files partitioned by year and month...")
        time_table.write.parquet(output_data + "time_table.parquet",
                                 partitionBy=["year", "month"],
                                 mode="overwrite")
        print("done.")

    # read in song data to use for songplays table
    print("Read in song data to use for songplays table...")
    song_data = input_data + "song_data/*/*/*/*.json"
    song_df = spark.read.json(song_data, schema=songsSchema)
    song_df.createOrReplaceTempView("songs_data_table")
    print("done.")

    # extract columns from joined song and log datasets to create songplays table
    print("Extract columns from joined song and log datasets to create songplays table...")
    artists_table = spark.sql('''
        SELECT DISTINCT artist_id,
                        artist_name AS name,
                        artist_location AS location,
                        artist_latitude AS latitude,
                        artist_longitude AS longitude
        FROM songs_data_table
    ''')
    artists_table.createOrReplaceTempView("artists")
    print("done.")

    print("Extract columns to create songplays table...")
    songplays_table = spark.sql('''
        SELECT year(l.ts) AS year,
               month(l.ts) AS month,
               l.ts AS start_time,
               l.userId AS user_id,
               l.level,
               s.song_id,
               a.artist_id,
               l.sessionId AS session_id,
               l.location,
               l.userAgent AS user_agent
        FROM clean AS l
        JOIN songs_data_table AS s ON (l.song = s.title AND l.artist = s.artist_name)
        JOIN artists AS a ON a.artist_id = s.artist_id
    ''')
    print("done.")

    print("Create songplay_id...")
    songplays_table = songplays_table.withColumn("songplay_id", monotonically_increasing_id())
    print("done.")

    # write songplays table to parquet files partitioned by year and month
    if aws:
        print("Write songplays table to parquet files partitioned by year and month...")
        songplays_table.write.parquet(output_data + "songplays_table.parquet",
                                      partitionBy=["year", "month"],
                                      mode="overwrite")
        print("done.")
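# Sketch of how the two processing functions above could be wired together in a
# main() entry point. The S3 bucket paths are hypothetical placeholders, not
# values from the original project; adjust them to the actual buckets.
def main():
    from pyspark.sql import SparkSession

    spark = (SparkSession.builder
             .appName("sparkify-data-lake")
             .getOrCreate())

    input_data = "s3a://source-bucket/"        # hypothetical source bucket
    output_data = "s3a://destination-bucket/"  # hypothetical destination bucket

    process_song_data(spark, input_data, output_data, aws=True)
    process_log_data(spark, input_data, output_data, aws=True)

    spark.stop()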
from pyspark.sql.types import LongType as Long, DecimalType as Dec, StructType as R, \
    StructField as Fld, DoubleType as Dbl, StringType as Str, IntegerType as Int, DateType as Date

songSchema = R([
    Fld("artist_id", Str()),
    Fld("artist_latitude", Dec(10, 5)),
    Fld("artist_location", Str()),
    Fld("artist_longitude", Dec(10, 5)),
    Fld("artist_name", Str()),
    Fld("duration", Dec(10, 5)),
    Fld("num_songs", Int()),
    Fld("song_id", Str()),
    Fld("title", Str()),
    Fld("year", Int()),
])

logSchema = R([
    Fld("artist", Str()),
    Fld("auth", Str()),
    Fld("firstName", Str()),
    Fld("gender", Str()),
    Fld("itemInSession", Int()),
    Fld("lastName", Str()),
    Fld("length", Dec(10, 5)),
    Fld("level", Str()),
    Fld("location", Str()),
    Fld("method", Str()),
    Fld("page", Str()),
    Fld("registration", Dbl()),
    Fld("sessionId", Int()),
    Fld("song", Str()),
    Fld("status", Str()),
    Fld("ts", Long()),
    Fld("userAgent", Str()),
    Fld("userId", Int()),
])
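# A short sketch showing the two explicit schemas above in use. Reading with
# schema= skips the inference pass and pins the decimal/long types declared
# here; the paths are placeholders, not paths from the original project.
def example_read_with_schemas(spark, input_data):
    songs = spark.read.json(input_data + "song_data/*/*/*/*.json", schema=songSchema)
    logs = spark.read.json(input_data + "log_data/*/*/*.json", schema=logSchema)
    print(songs.dtypes)  # e.g. artist_latitude shows up as decimal(10,5)
    print(logs.dtypes)
    return songs, logs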
from pyspark.sql.functions import col, concat, lit, trim, udf, upper
from pyspark.sql.types import (StructType, StructField as Fld, StringType as Str,
                               IntegerType as Int, DecimalType as Dec)


def demographics_to_parquet(spark, src, dest, joins):
    # map each race label to a short code used in the demographics_id
    @udf('string')
    def gen_demo_id(race):
        if race == "Black or African-American":
            return "BAA"
        elif race == "Hispanic or Latino":
            return "HL"
        elif race == "White":
            return "W"
        elif race == "Asian":
            return "A"
        elif race == "American Indian and Alaska Native":
            return "AI"
        else:
            return "O"

    # expected column layout and types of the demographics table
    cSchema = StructType([
        Fld("demographics_id", Str(), False),
        Fld("port_id", Str(), False),
        Fld("city", Str(), True),
        Fld("state", Str(), True),
        Fld("median_age", Dec(4, 1), True),
        Fld("male_population", Int(), True),
        Fld("female_population", Int(), True),
        Fld("total_population", Int(), True),
        Fld("avg_household_size", Dec(3, 2), True),
        Fld("foreign_born", Int(), True),
        Fld("race", Str(), True),
        Fld("race_code", Str(), True)
    ])

    demographics_df = spark.read.csv(src, sep=";", header=True)
    port_df = spark.read.parquet(joins[0])

    demographics_df = demographics_df.withColumn('race_code', gen_demo_id("Race"))

    # join on city name and state code to attach the port_id
    joined_df = demographics_df.join(port_df, [
        upper(demographics_df['City']) == upper(port_df['city']),
        trim(demographics_df['State Code']) == trim(port_df['state'])
    ])
    joined_df = joined_df.select(demographics_df["*"], port_df['port_id'])
    joined_df = joined_df.withColumn(
        'demographics_id', concat(col("port_id"), lit("-"), col("race_code")))
    joined_df = joined_df.selectExpr(
        "demographics_id", "port_id", "city", "state",
        "`Median Age` as median_age",
        "`Male Population` as male_population",
        "`Female Population` as female_population",
        "`Total Population` as total_population",
        "`Average Household Size` as avg_household_size",
        "`Foreign-born` as foreign_born",
        "`Race` as race",
        "race_code")

    # apply typing for easy copy into Redshift later
    joined_df = joined_df.withColumn("demographics_id", col("demographics_id").cast(Str()))
    joined_df = joined_df.withColumn("port_id", col("port_id").cast(Str()))
    joined_df = joined_df.withColumn("city", col("city").cast(Str()))
    joined_df = joined_df.withColumn("state", col("state").cast(Str()))
    joined_df = joined_df.withColumn("median_age", col("median_age").cast(Dec(4, 1)))
    joined_df = joined_df.withColumn("male_population", col("male_population").cast(Int()))
    joined_df = joined_df.withColumn("female_population", col("female_population").cast(Int()))
    joined_df = joined_df.withColumn("total_population", col("total_population").cast(Int()))
    joined_df = joined_df.withColumn(
        "avg_household_size", col("avg_household_size").cast(Dec(3, 2)))
    joined_df = joined_df.withColumn("foreign_born", col("foreign_born").cast(Int()))
    joined_df = joined_df.withColumn("race", col("race").cast(Str()))
    joined_df = joined_df.withColumn("race_code", col("race_code").cast(Str()))

    # fill nulls with 0
    joined_df = joined_df.fillna(0)

    joined_df.write.mode('overwrite').parquet(dest)
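# Hedged usage sketch for demographics_to_parquet. The CSV source, the parquet
# destination, and the pre-built port dimension passed through `joins` are all
# hypothetical paths used only to show the expected call shape.
def example_build_demographics(spark):
    demographics_to_parquet(
        spark,
        src="data/us-cities-demographics.csv",  # semicolon-delimited source file
        dest="output/demographics.parquet",      # destination parquet folder
        joins=["output/ports.parquet"],          # joins[0] must be the port dimension parquet
    )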