def process_song_data(spark, input_data, output_data):
    """Load song data, build the songs and artists tables, and write them as parquet.

    Args:
        spark(:obj:`pyspark.sql.session.SparkSession`): SparkSession
        input_data (str): prefix the ``song_data/*/*/*/*.json`` glob is appended to
        output_data (str): prefix the ``songs.parquet`` / ``artists.parquet``
            output names are appended to

    Returns:
        None

    Raises:
        Exception: an error raised while reading the song files is printed and
            then re-raised, because neither table can be built without the
            source DataFrame.
    """
    print("**** Starting to process song data *****")

    # get filepath to song data file
    song_data = input_data+'song_data/*/*/*/*.json'

    # read song data file
    songSchema = R([
        Fld("artist_id",Str()),
        Fld("artist_latitude",Dbl()),
        Fld("artist_location",Str()),
        Fld("artist_longitude",Dbl()),
        Fld("artist_name",Str()),
        Fld("song_id",Str()),
        Fld("duration",Dbl()),
        Fld("num_songs",Int()),
        Fld("title",Str()),
        Fld("year",Int()),
    ])

    try:
        df = spark.read.json(song_data, schema=songSchema)
    except Exception as e:
        # Continuing without `df` would only fail later with a NameError that
        # hides the real cause, so report the error and stop here.
        print(e)
        raise

    # extract columns to create songs table
    songs_fields = ["song_id", "title", "artist_id", "year", "duration"]
    songs_table = df.select(songs_fields).dropDuplicates(["song_id"])

    # write songs table to parquet files partitioned by year and artist
    try:
        songs_table.write.parquet(output_data + "songs.parquet",
                                  partitionBy=("year", "artist_id"),
                                  mode="overwrite")
    except Exception as e:
        # A failed write is reported but does not prevent the artists table
        # from being attempted.
        print(e)
    else:
        # Only claim success when the write actually succeeded.
        print("**** songs table data load is complete *****")

    # extract columns to create artists table
    # NOTE(review): the "lattitude" spelling is kept as-is because it is the
    # column name written to artists.parquet; confirm with downstream readers
    # before renaming it.
    artists_fields = ["artist_id",
                      "artist_name as name",
                      "artist_location as location",
                      "artist_latitude as lattitude",
                      "artist_longitude as longitude"]
    artists_table = df.selectExpr(artists_fields).dropDuplicates(["artist_id"])

    # write artists table to parquet files
    try:
        artists_table.write.parquet(output_data + "artists.parquet",
                                    mode="overwrite")
    except Exception as e:
        print(e)
    else:
        print("**** artists table data load is complete *****")

    print("**** song data processing is finished *****")
def process_song_data(spark, input_data, output_data):
    '''
    Description: Load the song data from the input location, derive the songs
    and artists tables and write them as parquet to the output location.
    Diagnostic output (path, row counts, schemas, sample rows) is printed
    along the way.

    Arguments:
        spark: SparkSession
        input_data: location for the input data
        output_data: prefix the "songs" / "artists" output names are appended to

    Returns:
        None
    '''
    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")
    print(song_data)

    # read song data file
    songsSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int())
    ])

    df = spark.read.json(song_data, schema=songsSchema).distinct()
    print(df.count())
    # show() prints the rows itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None" line).
    df.show(5, truncate=False)
    df.printSchema()

    # extract columns to create songs table
    songs_table = df.select("song_id", "title", "artist_id", "year",
                            "duration").distinct()
    songs_table.printSchema()
    songs_table.show(5)
    print('songs', songs_table.count())

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode('overwrite').partitionBy(
        "year", "artist_id").parquet(output_data + "songs")

    # extract columns to create artists table
    df.createOrReplaceTempView("df")
    artists_table = spark.sql(
        "select artist_id, artist_name as name, artist_location as location, artist_latitude as latitude, artist_longitude as longitude from df"
    ).distinct()
    artists_table.printSchema()
    artists_table.show(5)
    print('artists', artists_table.count())

    # write artists table to parquet files
    artists_table.repartitionByRange(
        3, "artist_id").write.mode('overwrite').parquet(output_data + "artists")
def process_song_data(spark, input_data, output_data):
    """
    Build the songs and artists tables from the song_data JSON files.

    Reads every file matching ``song_data/*/*/*/*.json`` under input_data
    with an explicit schema, writes the songs table (partitioned by year and
    artist_id) and the artists table in parquet format under output_data,
    and returns the song DataFrame as it was read.

    Args:
        spark : Spark Session
        input_data (string) : prefix the song_data glob is appended to
        output_data (string) : directory receiving 'songs' and 'artists'

    Returns:
        Spark Dataframe : the unfiltered song data, for reuse by the caller
    """
    # schema of one song record, in (column name, type) order
    song_fields = [
        ("artist_id", Str),
        ("artist_latitude", Dbl),
        ("artist_location", Str),
        ("artist_longitude", Dbl),
        ("artist_name", Str),
        ("duration", Dbl),
        ("num_songs", Int),
        ("song_id", Str),
        ("title", Str),
        ("year", Int),
    ]
    songSchema = StructType([Fld(name, dtype()) for name, dtype in song_fields])

    # load the raw song records
    source_glob = input_data + 'song_data/*/*/*/*.json'
    df = spark.read.json(source_glob, schema=songSchema)

    # songs: one row per non-null song_id
    song_columns = df.select("song_id", "title", "artist_id", "year", "duration")
    songs_table = song_columns.where("song_id is not null")
    songs_table = songs_table.dropDuplicates(['song_id'])

    songs_target = os.path.join(output_data, 'songs')
    songs_table.write.partitionBy("year", "artist_id").parquet(songs_target, 'overwrite')

    # artists: one row per non-null artist_id, with the artist_ prefix dropped
    artist_columns = df.select(
        col("artist_id"),
        col("artist_name").alias("name"),
        col("artist_location").alias("location"),
        col("artist_latitude").alias("latitude"),
        col("artist_longitude").alias("longitude"),
    )
    artists_table = artist_columns.where("artist_id is not null")
    artists_table = artists_table.dropDuplicates(['artist_id'])

    artists_target = os.path.join(output_data, 'artists')
    artists_table.write.parquet(artists_target, 'overwrite')

    # the raw song data is handed back for further processing
    return df
def schema_song_data():
    """
    Description: Schema design for song datasets.

    Prints progress banners and returns the schema object describing one
    song record (ten fields, from artist_id to year).

    Returns:
        The schema built with ``R([...])``.

    Raises:
        Exception: any error raised while building the schema is reported
            with a failure banner and then re-raised.
    """
    print("schema_song_data fuction is statrting.")
    print("**************************************")
    try:
        schema = R([
            Fld("artist_id", Str()),
            Fld("artist_latitude", Dbl()),
            Fld("artist_location", Str()),
            Fld("artist_longitude", Dbl()),
            Fld("artist_name", Str()),
            Fld("duration", Dbl()),
            Fld("num_songs", Int()),
            Fld("song_id", Str()),
            Fld("title", Str()),
            Fld("year", Int()),
        ])
    except Exception:
        # Report the failure truthfully and let the caller see the error
        # instead of silently receiving None.
        print("schema_song_data function failed to create the schema.")
        print("************************************************")
        raise
    print("schema_song_data is successfull created")
    print("***************************************")
    return schema
def process_song_data(spark, input_data, output_data):
    """
    Description: Read the song_data JSON files into a Spark dataframe, extract
        the columns of the "songinf table" and the "artist table", clean them
        (no duplicate, null or empty ids) and save both as parquet.

    Parameters:
        -spark: spark session
        -input_data: directory searched for files matching "song_data_*.json"
        -output_data: location under which "song_table.parquet" and
                      "artist_table.parquet" are written

    Return:
        None
    """

    #--------------------read song data--------------------#
    print('Read song_data...')

    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data_*.json")

    # define the song data schema for reading
    SongSchema = R([
        Fld("artist_id",Str()),
        Fld("artist_latitude",Doub()),
        Fld("artist_location",Str()),
        Fld("artist_longitude",Doub()),
        Fld("artist_name",Str()),
        Fld("duration",Doub()),
        Fld("num_songs",Long()),
        Fld("song_id",Str()),
        Fld("title",Str()),
        Fld("year",Long())
    ])

    # read song data file
    song_df = spark.read.json(song_data, schema=SongSchema)

    #--------------------deal with song table--------------------#
    # extract columns to create songinf df
    songinf_df = song_df.select(['song_id', 'title', 'artist_id', 'year', 'duration'])
    songinf_df = songinf_df.dropDuplicates(['song_id'])
    songinf_df = songinf_df.dropna(how = "any", subset = ["song_id"])
    songinf_df = songinf_df.filter(songinf_df.song_id != "")

    print('Songs table: ')
    # show() prints the rows itself and returns None; wrapping it in print()
    # would add a stray "None" line to the output.
    songinf_df.sort('song_id').show(5)

    # write songs table to parquet files partitioned by year and artist
    # NOTE(review): no save mode is set, so the writer's default applies;
    # confirm that is the intended behaviour when the target already exists.
    print('Save Songs table into S3...')
    songinf_df.write.partitionBy("year", "artist_id").parquet("{}/song_table.parquet".format(output_data))

    #--------------------deal with artists table--------------------#
    # extract columns to create artists df
    artist_df = song_df.select(['artist_id', 'artist_name', 'artist_location', 'artist_latitude', 'artist_longitude'])
    artist_df = artist_df.dropDuplicates(['artist_id'])
    artist_df = artist_df.dropna(how = "any", subset = ["artist_id"])
    artist_df = artist_df.filter(artist_df.artist_id != "")

    print('artists table: ')
    artist_df.sort('artist_id').show(5)

    # write artists table to parquet files
    print('Save artists table into S3...')
    artist_df.write.parquet("{}/artist_table.parquet".format(output_data))
def process_song_data(spark, input_data, output_data):
    """
    Turn the raw sparkify song files into the songs and artists parquet
    datasets.

    Arguments:
        spark {SparkSession}: Spark session used for reading and writing
        input_data {str}: root location the "song_data/*/*/*/*.json" glob is
            appended to
        output_data {str}: root location the "song_data/" and "artist_data/"
            output folders are appended to
    """
    # The reader is given the schema explicitly instead of inferring it.
    column_types = [
        ("num_songs", Int),
        ("artist_id", Str),
        ("artist_latitude", Str),
        ("artist_longitude", Str),
        ("artist_location", Str),
        ("artist_name", Str),
        ("song_id", Str),
        ("title", Str),
        ("duration", Dbl),
        ("year", Int),
    ]
    songSchema = R([Fld(name, dtype()) for name, dtype in column_types])

    # Read all song files; the frame is marked for caching because it feeds
    # both tables below.
    source_path = f"{input_data}song_data/*/*/*/*.json"
    df_song = spark.read.json(source_path, schema=songSchema).cache()

    # songs table: rows with a non-empty song_id, one row per song_id
    has_song_id = df_song["song_id"] != ''
    song_columns = ['song_id', 'title', 'artist_id', 'year', 'duration']
    songs_table = df_song.filter(has_song_id).select(song_columns)
    songs_table = songs_table.dropDuplicates(['song_id'])

    # stored partitioned by year and artist
    songs_writer = songs_table.write.mode('overwrite').partitionBy("year", "artist_id")
    songs_writer.parquet(f"{output_data}song_data/")

    # artists table: rows with a non-empty artist_id, one row per artist_id
    has_artist_id = df_song["artist_id"] != ''
    artist_exprs = [
        'artist_id',
        'artist_name as name',
        'artist_location as location',
        'artist_latitude as latitude',
        'artist_longitude as longitude',
    ]
    artists_table = df_song.filter(has_artist_id).selectExpr(artist_exprs)
    artists_table = artists_table.dropDuplicates(['artist_id'])

    artists_table.write.mode('overwrite').parquet(f"{output_data}artist_data/")
def get_residence_cities(spark):
    """Load the residence-city lookup from 'residence_city.txt' into Spark.

    The text file is read with pandas as two '='-separated columns
    (id, country). Single quotes are removed from the country values and
    surrounding whitespace is stripped. The rows are then converted into a
    Spark DataFrame, written to 'resident_city.parquet' (overwriting earlier
    output) and returned.

    Args:
        spark: session used to create and write the DataFrame

    Returns:
        The Spark DataFrame with columns 'id' and 'country'.
    """
    lookup = pd.read_csv('residence_city.txt', sep='=', names=['id', 'country'])

    # tidy the country labels: drop quote characters, then trim whitespace
    unquoted = lookup['country'].str.replace("'", '')
    lookup['country'] = unquoted.str.strip()

    schema = R([
        Fld('id', Str(), True),
        Fld('country', Str(), True),
    ])
    rows = lookup.values.tolist()

    cities = spark.createDataFrame(rows, schema)
    cities.write.mode('overwrite').parquet('resident_city.parquet')
    return cities
def process_song_data(spark, input_data, output_data):
    """Create the songs and artists parquet tables from the song JSON files.

    Songs are the distinct (title, artist_id, year, duration) rows, each
    given a generated ``song_id`` via ``monotonically_increasing_id()``;
    the source schema used here does not read a song_id field. Artists are
    the distinct artist rows with the ``artist_`` prefix removed from the
    descriptive columns.

    Args:
        spark: Spark session
        input_data: prefix the 'song_data/*/*/*/*.json' glob is appended to
        output_data: prefix the 'songs/' and 'artists/' folders are appended to
    """
    schema_columns = [
        ("artist_id", Str),
        ("artist_latitude", Dbl),
        ("artist_location", Str),
        ("artist_longitude", Dbl),
        ("artist_name", Str),
        ("duration", Dbl),
        ("num_songs", Int),
        ("title", Str),
        ("year", Int),
    ]
    songSchema = R([Fld(name, dtype()) for name, dtype in schema_columns])

    # read song data file
    source_glob = input_data + 'song_data/*/*/*/*.json'
    df = spark.read.json(source_glob, schema=songSchema)

    # songs table: de-duplicate first, then attach the generated id
    distinct_songs = df.select(["title", "artist_id", "year", "duration"]).dropDuplicates()
    songs_table = distinct_songs.withColumn("song_id", monotonically_increasing_id())

    # parquet output partitioned by year and artist
    songs_writer = songs_table.write.partitionBy("year", "artist_id")
    songs_writer.parquet(output_data + 'songs/')

    # artists table
    artist_exprs = [
        "artist_id",
        "artist_name as name",
        "artist_location as location",
        "artist_latitude as latitude",
        "artist_longitude as longitude",
    ]
    artists_table = df.selectExpr(artist_exprs).dropDuplicates()
    artists_table.write.parquet(output_data + 'artists/')
def process_song_data(spark, input_data, output_data):
    """
    Load the songs JSON dataset and use it to create the songs and artists
    tables; the complete song data is exported as well.

    Input:
        spark = SparkSession object
        input_data = Start of path variable for input files
        output_data = Start of path variable for output files
    Output:
        None
    """
    # get filepath to song data file
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')

    # Define schema
    SongSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])

    # read song data file
    df = spark.read.json(song_data, schema=SongSchema)

    # extract columns to create songs table
    # (the DataFrame method is dropDuplicates; the all-lowercase spelling
    # does not exist and raises AttributeError)
    songs_table = df.select("song_id", "title", "artist_id", "year",
                            "duration").dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    output_path = os.path.join(output_data, 'songs_table.parquet')
    songs_table.write.partitionBy("year", "artist_id").parquet(output_path, mode="overwrite")

    # extract columns to create artists table
    artists_table = df.select("artist_id", "artist_name", "artist_location",
                              "artist_latitude", "artist_longitude").dropDuplicates()

    # write artists table to parquet files
    output_path = os.path.join(output_data, 'artists_table.parquet')
    artists_table.write.parquet(output_path, mode="overwrite")

    # export whole songs data file to parquet
    output_path = os.path.join(output_data, 'songs_data_table.parquet')
    df.write.parquet(output_path, mode="overwrite")
def process_song_data(spark, input_data, output_data):
    """
    Processes song data and stores them as parquet files

    Loads song data into a spark DataFrame and transforms them into
    songs and artists DataFrames which are subsequently written as
    parquet files to songs and artists folders in the specified
    output path.

    Parameters:
        spark : SparkSession instance
        input_data (str) : Path prefix the 'song_data/*/*/*/*.json' glob is
            appended to
        output_data (str) : Path of the directory where the parquet files
            will be stored
    """
    # specify schema for song data
    # The field must be spelled 'artist_longitude' to match the key in the
    # song records; a misspelled field name matches nothing and yields a
    # column that is always null.
    songs_schema = Struct([
        Fld('num_songs', Int()),
        Fld('artist_id', Str()),
        Fld('artist_latitude', Double()),
        Fld('artist_longitude', Double()),
        Fld('artist_location', Str()),
        Fld('artist_name', Str()),
        Fld('song_id', Str()),
        Fld('title', Str()),
        Fld('duration', Double()),
        Fld('year', Int())
    ])

    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    # read song data file
    df = spark.read.json(song_data, songs_schema)

    # extract columns to create songs table
    songs_table = df[['song_id', 'title', 'artist_id', 'year', 'duration']] \
        .dropDuplicates(['song_id'])

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.parquet(os.path.join(output_data, 'songs'),
                              'overwrite',
                              partitionBy=['year', 'artist_id'])

    # extract columns to create artists table
    artists_table = df[[
        'artist_id', 'artist_name', 'artist_location',
        'artist_latitude', 'artist_longitude'
    ]].dropDuplicates(['artist_id'])

    # write artists table to parquet files
    artists_table.write.parquet(os.path.join(output_data, 'artists'), 'overwrite')
def process_song_data(spark, input_data, output_data):
    """Process song data, transform the data into songs and artists tables
    and store it in parquet files.

    Parameters
    ----------
    spark : SparkSession
        session used to read the JSON files and write the parquet output
    input_data : string
        input data prepend path
    output_data : string
        output data prepend path
    """
    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")

    song_schema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str(), False),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str(), False),
        Fld("song_id", Str(), False),
        Fld("title", Str(), False),
        Fld("duration", Dbl(), False),
        Fld("year", Int())
    ])

    # read song data file
    df = spark.read.json(song_data, song_schema)

    # extract columns to create songs table
    songs_table = df.select(
        ["song_id", "title", "artist_id", "year", "duration"])

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode("overwrite").partitionBy('year', 'artist_id') \
        .parquet(os.path.join(output_data, 'analytics/songs'))

    # extract columns to create artists table
    artists_table = df.select([
        "artist_id", "artist_name", "artist_location",
        "artist_latitude", "artist_longitude"
    ])
    artists_table = artists_table.withColumnRenamed("artist_name", "name") \
        .withColumnRenamed("artist_location", "location") \
        .withColumnRenamed("artist_latitude", "latitude") \
        .withColumnRenamed("artist_longitude", "longitude")

    # The source has one record per song, so an artist with several songs
    # appears several times; drop the exact duplicate rows.
    artists_table = artists_table.dropDuplicates()

    # write artists table to parquet files
    artists_table.write.mode("overwrite") \
        .parquet(os.path.join(output_data, 'analytics/artists'))
def process_song_data(spark, input_data, output_data):
    """Import the song dataset, extract the songs and artists tables and
    append them to parquet files.

    Parameters:
        spark: spark session
        input_data: prefix of the source data location
        output_data: prefix of the destination data location

    Returns:
        None. Writes the songs table in parquet to output_data + "songs"
        and the artists table in parquet to output_data + "artists".
    """
    # Setting up the JSON table structure for the Song dataset
    song_dataset_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Str()),
    ])

    # get filepath to song data file
    #   use "song_data/*/*/*/*.json" for full dataset
    #   use "song_data/A/B/C/TRABCEI128F424C983.json" to pull a single record
    song_data = input_data + "song_data/*/*/*/*.json"

    # read song data file with dataset_schema
    df = spark.read.json(song_data, schema=song_dataset_schema)

    # extract columns to create songs table; `title` is read by the schema
    # and describes the song, so it belongs in this table
    songs_table = df.select('song_id', 'title', 'artist_id', 'year', 'duration')

    # drop duplicate rows in songs table
    songs_table = songs_table.dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    # NOTE(review): mode('append') adds to whatever already exists at the
    # destination, so running the job twice stores the rows twice - confirm
    # this is intended.
    songs_table.write.mode('append').partitionBy(
        'year', 'artist_id').parquet(output_data + "songs")

    # extract columns to create artists table
    artists_table = df.select('artist_id', 'artist_name', 'artist_location',
                              'artist_latitude', 'artist_longitude')

    # drop duplicate rows in artists table
    artists_table = artists_table.dropDuplicates()

    # write artists table to parquet files
    artists_table.write.mode('append').parquet(output_data + "artists")
def process_song_data(spark, input_data, output_data):
    """Read the song files, write the songs and artists tables, return the raw data.

    Only files matching 'song_data/A/*/*/*.json' under input_data are read.
    The songs table is written to output_data + 'songs' (partitioned by year
    and artist_id) and the artists table to output_data + 'artist'; both
    overwrite existing output.

    Args:
        spark: Spark session
        input_data: prefix of the input location
        output_data: prefix of the output location

    Returns:
        The song DataFrame as it was read from the JSON files.
    """
    print('%%%%% Starting up the SONG data process')

    # schema for the data about to be pulled
    schema_columns = [
        ("num_songs", Int),
        ("artist_id", Str),
        ("artist_latitude", Dbl),
        ("artist_longitude", Dbl),
        ("artist_location", Str),
        ("artist_name", Str),
        ("song_id", Str),
        ("title", Str),
        ("duration", Dbl),
        ("year", Int),
    ]
    songSchema = ST([Fld(name, dtype()) for name, dtype in schema_columns])

    # read song data file using the explicit schema
    song_data = 'song_data/A/*/*/*.json'
    raw_song_df = spark.read.json(input_data + song_data, songSchema)

    # songs table: year and duration are cast explicitly
    song_columns = [
        raw_song_df["song_id"],
        raw_song_df["title"],
        raw_song_df["artist_id"],
        raw_song_df["year"].cast(Int()),
        raw_song_df["duration"].cast(Dbl()),
    ]
    songs_table = raw_song_df.select(*song_columns)

    # parquet output partitioned by year and artist
    songs_writer = songs_table.write.mode('overwrite').partitionBy('year', 'artist_id')
    songs_writer.parquet(output_data + 'songs')
    print('%%%%% Songs table has been created and written to the S3 Bucket')

    # artists table: one row per (artist_id, name) pair
    artist_columns = [
        raw_song_df["artist_id"],
        raw_song_df["artist_latitude"].alias('latitude'),
        raw_song_df["artist_location"].alias('location'),
        raw_song_df["artist_longitude"].alias('longitude'),
        raw_song_df["artist_name"].alias('name'),
    ]
    artists_table = raw_song_df.select(*artist_columns)
    artists_table = artists_table.dropDuplicates(['artist_id', 'name'])

    artists_table.write.mode('overwrite').parquet(output_data + 'artist')
    print('%%%%% Artists table has been created and written to the S3 Bucket')

    print('%%%%% SONG data has been completed and returning the raw_song_df')
    return raw_song_df
def process_song_data(spark, input_data, output_data):
    """
    Read the song_data dataset, derive the songs and artists tables,
    register each as a temp view ('songs', 'artists') and write each to
    parquet under output_data.
    """
    schema_columns = [
        ("artist_id", Str),
        ("artist_latitude", Dbl),
        ("artist_location", Str),
        ("artist_longitude", Dbl),
        ("artist_name", Str),
        ("duration", Dbl),
        ("num_songs", Long),
        ("song_id", Str),
        ("title", Str),
        ("year", Long),
    ]
    song_schema = R([Fld(name, dtype()) for name, dtype in schema_columns])

    # read song data file
    source_path = os.path.join(input_data, 'song_data/*/*/*/*.json')
    df = spark.read.json(source_path, schema=song_schema)

    # songs table: distinct rows of the five song columns
    song_columns = df.select('song_id', 'title', 'artist_id', 'year', 'duration')
    songs_table = song_columns.dropDuplicates()
    songs_table.createOrReplaceTempView('songs')

    # parquet output partitioned by year and artist
    songs_target = os.path.join(output_data, 'songs/songs.parquet')
    songs_table.write.partitionBy('year', 'artist_id').parquet(songs_target, 'overwrite')

    # artists table: strip the artist_ prefix from the descriptive columns,
    # then keep distinct rows
    artists_table = df.select('artist_id', 'artist_name', 'artist_location',
                              'artist_latitude', 'artist_longitude')
    renames = [
        ('artist_name', 'name'),
        ('artist_location', 'location'),
        ('artist_latitude', 'latitude'),
        ('artist_longitude', 'longitude'),
    ]
    for old_name, new_name in renames:
        artists_table = artists_table.withColumnRenamed(old_name, new_name)
    artists_table = artists_table.dropDuplicates()
    artists_table.createOrReplaceTempView('artists')

    artists_target = os.path.join(output_data, 'artists/artists.parquet')
    artists_table.write.parquet(artists_target, 'overwrite')
def process_song_data(spark, input_data, output_data):
    """
    Process song data and create the songs and artists tables.

    Only files matching '/song-data/A/A/B/*.json' under input_data are read.
    Both tables are handed to ``write_parquet`` for storage.

    :param spark: Spark session
    :param input_data: input location prefix
    :param output_data: output location passed on to ``write_parquet``
    :return: Data frame of song data
    """
    schema_columns = [
        ("artist_id", Str),
        ("artist_latitude", Dbl),
        ("artist_location", Str),
        ("artist_longitude", Dbl),
        ("artist_name", Str),
        ("duration", Dbl),
        ("num_songs", Int),
        ("song_id", Str),
        ("title", Str),
        ("year", Int),
    ]
    songs_schema = R([Fld(name, dtype()) for name, dtype in schema_columns])

    # read song data file
    source_glob = input_data + '/song-data/A/A/B/*.json'
    print('Reading song data.')
    df = spark.read.json(source_glob, schema=songs_schema)

    # songs table: distinct rows of the song columns
    selected_songs = df.selectExpr(
        ['song_id', 'title', 'artist_id', 'year', 'duration'])
    songs_table = selected_songs.dropDuplicates()

    # stored partitioned by year and artist
    print('Writing songs to parquet.')
    write_parquet(songs_table, output_data, 'songs', 'year', 'artist_id')

    # artists table: distinct rows with the artist_ prefix removed
    selected_artists = df.selectExpr([
        'artist_id',
        'artist_name as name',
        'artist_location as location',
        'artist_latitude as latitude',
        'artist_longitude as longitude',
    ])
    artists_table = selected_artists.dropDuplicates()

    # stored without partition columns
    print('Writing artists to parquet.')
    write_parquet(artists_table, output_data, 'artists', None, None)

    return df
def process_song_data(spark, input_data, output_data):
    '''
    Read the song JSON files, build the songs table and the artists table
    and save both as parquet.

    The songs table holds the distinct (title, artist_id, year, duration)
    rows plus a ``song_id`` generated with ``monotonically_increasing_id()``;
    the schema used here does not read a song_id from the files.

    :param spark: spark session
    :param input_data: data location for input data
    :param output_data: data location for output data
    :return: no return value
    '''
    # schema of the song files
    schema_columns = [
        ('artist_id', Str),
        ('artist_latitude', Dbl),
        ('artist_location', Str),
        ('artist_longitude', Dbl),
        ('artist_name', Str),
        ('duration', Dbl),
        ('num_songs', Int),
        ('title', Str),
        ('year', Int),
    ]
    songSchema = R([Fld(name, dtype()) for name, dtype in schema_columns])

    # load songs json files
    source_glob = input_data + 'song_data/*/*/*/*.json'
    df_songs = spark.read.json(source_glob, schema=songSchema)

    # songs_table: de-duplicate, then attach the generated id
    distinct_songs = df_songs.select(
        ['title', 'artist_id', 'year', 'duration']).dropDuplicates()
    songs_table = distinct_songs.withColumn('song_id', monotonically_increasing_id())

    # write songs_table, partitioned by year and artist
    songs_writer = songs_table.write.partitionBy('year', 'artist_id')
    songs_writer.parquet(output_data + 'songs/')

    # artists_table: distinct artist rows, then drop the artist_ prefix
    artists_table = df_songs.select([
        'artist_id', 'artist_name', 'artist_location',
        'artist_latitude', 'artist_longitude',
    ]).dropDuplicates()
    renames = [
        ('artist_name', 'name'),
        ('artist_location', 'location'),
        ('artist_latitude', 'latitude'),
        ('artist_longitude', 'longitude'),
    ]
    for old_name, new_name in renames:
        artists_table = artists_table.withColumnRenamed(old_name, new_name)

    # write artists_table
    artists_table.write.parquet(output_data + 'artists/')
def process_song_data(spark, input_data, output_data):
    """
    process_song_data - Reads the song data files, stores the song columns
    in a parquet file (partitioned by year and artist_id), then extracts
    the distinct artists and stores them in a second parquet file.
    """
    schema_columns = [
        ("num_songs", Int),
        ("artist_id", Str),
        ("artist_latitude", Dbl),
        ("artist_longitude", Dbl),
        ("artist_location", Str),
        ("artist_name", Str),
        ("song_id", Str),
        ("title", Str),
        ("duration", Dbl),
        ("year", Int),
    ]
    songSchema = R([Fld(name, dtype()) for name, dtype in schema_columns])

    # Read song data file
    source_path = os.path.join(input_data, 'song_data/*/*/*/*.json')
    df = spark.read.json(source_path, schema=songSchema)

    # Songs table: the five song columns, as read
    songs_table = df.select(
        ['song_id', 'title', 'artist_id', 'year', 'duration'])

    # Parquet output partitioned by year and artist
    songs_target = os.path.join(output_data, 'songs.parquet')
    songs_writer = songs_table.write.partitionBy("year", "artist_id").mode('overwrite')
    songs_writer.parquet(songs_target)

    # Artists table: rename away the artist_ prefix, then keep distinct rows
    artists_table = df.select([
        'artist_id', 'artist_name', 'artist_location',
        'artist_latitude', 'artist_longitude',
    ])
    renames = [
        ('artist_name', 'name'),
        ('artist_location', 'location'),
        ('artist_latitude', 'latitude'),
        ('artist_longitude', 'longitude'),
    ]
    for old_name, new_name in renames:
        artists_table = artists_table.withColumnRenamed(old_name, new_name)
    artists_table = artists_table.distinct()

    # Write artists table to parquet files
    artists_target = os.path.join(output_data, 'artists.parquet')
    artists_table.write.mode('overwrite').parquet(artists_target)
def process_song_data(spark, input_data_songs, output_data):
    """
    Read song data by providing it an expected schema.
    Create songs and artists tables.

    Args:
        spark: Spark session
        input_data_songs: path (or glob) of the song JSON files, passed to
            the reader unchanged
        output_data: prefix the 'songs/songs_table.parquet' and
            'artists/artists_table.parquet' targets are appended to

    Returns:
        None
    """
    # define song data schema to improve performance
    song_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long())
    ])

    song_data = input_data_songs

    t_start = time()
    dfSongs = spark.read.json(song_data, schema=song_schema)
    t_end = time() - t_start
    print('Read song data in {} secs'.format(t_end))

    dfSongs.printSchema()
    # A bare count() whose result is discarded would scan the whole dataset
    # for nothing, so only the schema and a sample are shown here.
    dfSongs.show(5)

    # songs table: rows with a non-empty song_id
    songs_table = dfSongs.filter(dfSongs.song_id != '')\
        .select(['song_id', 'title', 'artist_id', 'year', 'duration'])
    songs_table.show(5)

    songs_table.write.partitionBy(
        "year", "artist_id").mode('overwrite').parquet(output_data + 'songs/songs_table.parquet')

    # artists table: rows with a non-empty artist_id, renamed, distinct
    artists_table = dfSongs.filter(dfSongs.artist_id != '') \
        .select(col("artist_id"),
                col("artist_name").alias("name"),
                col("artist_location").alias("location"),
                col("artist_longitude").alias("longitude"),
                col("artist_latitude").alias("latitude"))\
        .dropDuplicates()
    artists_table.show(5)

    artists_table.write.mode('overwrite').parquet(
        output_data + 'artists/artists_table.parquet')
def immigration_etl(source="s3://immigration-us-1/sas_data",
                    destination="s3://immigration-us-1/sas_data_ready_to_use",
                    country_dict="s3://immigration-us-1/raw_data/country_dict.csv",
                    visa_dict="s3://immigration-us-1/raw_data/visa_dict.csv"
                    ):
    """Prepare the immigration parquet data and write it to `destination`.

    Steps: read `source`, keep rows whose i94mode equals 1, select the
    columns of interest, replace the i94cit / i94res / i94visa codes with
    the labels from the two lookup CSVs, convert `arrdate` with
    `date_converter`, cast the numeric id/date columns to integer, show ten
    rows and write the result as parquet.

    Relies on the module-level `spark` session.

    Args:
        source: location of the input parquet data
        destination: location the prepared parquet data is written to
        country_dict: '='-separated CSV (with header row) of country codes
        visa_dict: '='-separated CSV (no header row) of visa codes

    Returns:
        None
    """
    # Local udf: converts the arrdate value through date_converter
    udf_to_datetime_sas = udf(lambda x: date_converter(x), DateType())

    Schema_country = R([
        Fld("id",Str()),
        Fld("country",Str())
    ])

    Schema_Visa = R([
        Fld("id",Str()),
        Fld("Visa_Type",Str())
    ])

    df_spark=spark.read.parquet(source)

    # Only immigrants from air
    immigrants=df_spark.where(F.col("i94mode")==1)
    immigrants=immigrants.select("cicid", "i94yr", "i94mon", "i94cit",
                                 "i94res", "i94port", "arrdate", "i94visa",
                                 "biryear", "gender", "visatype", "airline")

    # The lookups get their own names so the path parameters are not
    # rebound to DataFrames.
    countries=spark.read.csv(country_dict, header=True, mode="DROPMALFORMED",
                             sep="=", schema=Schema_country)

    # NOTE(review): a right join keeps every lookup row and drops immigrant
    # rows whose code has no lookup entry - confirm this is intended (the
    # same applies to the two joins below).
    immigrants = immigrants.join(countries, immigrants.i94cit == countries.id, how='right')
    immigrants=immigrants.withColumnRenamed("country", "cit_country")
    immigrants=immigrants.drop('id','i94cit')

    immigrants = immigrants.join(countries, immigrants.i94res == countries.id, how='right')
    immigrants=immigrants.withColumnRenamed("country", "res_country")
    immigrants=immigrants.drop('id','i94res')

    visas=spark.read.csv(visa_dict, header=False, mode="DROPMALFORMED",
                         sep="=", schema=Schema_Visa)
    immigrants = immigrants.join(visas, immigrants.i94visa == visas.id, how='right')
    immigrants=immigrants.drop('id','i94visa')

    immigrants=immigrants.withColumn("arrdate", udf_to_datetime_sas("arrdate"))

    # cast the numeric columns to integer, one column at a time
    for column in ("cicid", "i94yr", "biryear", "i94mon"):
        immigrants = immigrants.withColumn(column, immigrants[column].cast(IntegerType()))

    immigrants.show(10)
    immigrants.write.parquet(destination)
def process_song_data(spark, input_data, output_data):
    """
    Read the song files, derive the songs and artists data and write both
    in parquet format.

    params:
    - spark: spark session object
    - input_data: input data path; "/song_data/*/*/*/*.json" is appended
    - output_data: output data path; "songs" / "artists" are appended
    """
    # explicit schema for reading the json files
    schema_columns = [
        ("num_songs", Int),
        ("artist_id", Str),
        ("artist_latitude", Dbl),
        ("artist_longitude", Dbl),
        ("artist_location", Str),
        ("artist_name", Str),
        ("song_id", Str),
        ("title", Str),
        ("duration", Dbl),
        ("year", Int),
    ]
    song_schema = St([Fld(name, dtype()) for name, dtype in schema_columns])

    # read song data file
    source_glob = input_data + "/song_data/*/*/*/*.json"
    df = spark.read.json(source_glob, schema=song_schema)

    # songs table: distinct rows of the five song columns
    song_columns = df.select("song_id", "title", "artist_id", "year", "duration")
    songs_table = song_columns.dropDuplicates()

    # parquet output partitioned by year and artist
    songs_target = output_data + "songs"
    songs_table.write.parquet(songs_target,
                              mode="overwrite",
                              partitionBy=["year", "artist_id"])

    # artists table: distinct rows with the artist_ prefix removed
    artist_columns = df.selectExpr(
        "artist_id",
        "artist_name as name",
        "artist_location as location",
        "artist_latitude as latitude",
        "artist_longitude as longitude",
    )
    artists_table = artist_columns.dropDuplicates()

    artists_target = output_data + "artists"
    artists_table.write.parquet(artists_target, mode="overwrite")
def process_song_data(spark, input_data, output_data):
    """
    Build the analytical songs and artists tables from the song data.

    The JSON files are read with an explicit schema, reshaped into the two
    tables (distinct rows only) and written as parquet; the songs table is
    partitioned by year and artist_id.

    Args:
        spark: the spark session
        input_data: prefix the "song_data/*/*/*/*.json" glob is appended to
        output_data: prefix the "songs_table.parquet" and
            "artists_table.parquet" targets are appended to
    """
    # song data schema: (name, type, nullable)
    schema_columns = [
        ("artist_id", Str, False),
        ("artist_latitude", Str, True),
        ("artist_longitude", Str, True),
        ("artist_location", Str, True),
        ("artist_name", Str, False),
        ("song_id", Str, False),
        ("title", Str, False),
        ("duration", Dbl, False),
        ("year", Int, False),
    ]
    song_data_schema = R([
        Fld(name, dtype(), nullable) for name, dtype, nullable in schema_columns
    ])

    # read song data file
    source_glob = input_data + "song_data/*/*/*/*.json"
    df = spark.read.json(source_glob, schema=song_data_schema)

    # songs table
    song_columns = df.select("song_id", "title", "artist_id", "year", "duration")
    songs_table = song_columns.distinct()

    # parquet output partitioned by year and artist
    songs_target = output_data + "songs_table.parquet"
    songs_table.write.parquet(songs_target,
                              mode="overwrite",
                              partitionBy=["year", "artist_id"])

    # artists table, with the artist_ prefix removed from the column names
    artist_columns = df.select(
        "artist_id",
        col("artist_name").alias("name"),
        col("artist_location").alias("location"),
        col("artist_latitude").alias("latitude"),
        col("artist_longitude").alias("longitude"),
    )
    artists_table = artist_columns.distinct()

    artists_target = output_data + "artists_table.parquet"
    artists_table.write.parquet(artists_target, mode="overwrite")
def get_song_src_schema():
    """
    Build the spark schema definition for the song source data.

    :return: The schema definition, with the fields in the order listed below
    """
    # (column name, type constructor) in the order the fields are declared
    columns = (
        ("num_songs", Int),
        ("artist_id", Str),
        ("artist_latitude", Dbl),
        ("artist_longitude", Dbl),
        ("artist_location", Str),
        ("artist_name", Str),
        ("song_id", Str),
        ("title", Str),
        ("duration", Dbl),
        ("year", Int),
    )
    fields = [Fld(name, make_type()) for name, make_type in columns]
    return R(fields)
def process_song_data(spark, input_data, output_data):
    """
    Extract data from song_data and write the songs and artists tables.

    Both tables are written in overwrite mode so the job can be re-run
    against the same output location. Previously only the songs table was
    overwritten and a second run failed on the existing artists output.

    Arguments:
    - spark : SparkSession object
    - input_data : input data root dir path
    - output_data : output data root dir path
    """
    # schema for song_data
    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),
    ])

    # get filepath to song data file
    song_data = input_data + "song_data/*/*/*"

    # read song data file, keeping a single row per song_id
    df = spark.read.json(song_data, schema=songSchema).dropDuplicates(["song_id"])

    # register the data as a temp view so both tables can be built with SQL
    df.createOrReplaceTempView("song_data")

    # extract columns to create songs table
    songs_table = spark.sql("""
        SELECT song_id, title, artist_id, year, duration
        FROM song_data
    """)

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(
        output_data + "songs_table", mode="overwrite"
    )

    # extract columns to create artists table (one row per artist_id)
    artists_table = spark.sql("""
        SELECT artist_id,
               artist_name as name,
               artist_location as location,
               artist_latitude as latitude,
               artist_longitude as longitude
        FROM song_data
    """).dropDuplicates(["artist_id"])

    # write artists table to parquet files; overwrite to match the songs
    # table, otherwise the default save mode rejects an existing path
    artists_table.write.parquet(output_data + "artists_table", mode="overwrite")
def process_song_data(spark, input_data, output_data):
    """
    Load song_data from ``input_data``, derive the songs and artists tables,
    and store both as parquet under ``output_data``.

    The read schema does not contain ``song_id``; the songs table gets a
    generated ``song_id`` column from ``monotonically_increasing_id()``.

    :param spark: Spark Session object
    :param input_data: Location (AWS S3 path) of songs metadata (song_data) JSON files
    :param output_data: Location (AWS S3 path) where dimensional tables will be stored in parquet format
    """
    # Schema used for reading, as (column, type) pairs
    schema_fields = [
        ("artist_id", Str),
        ("artist_latitude", Dbl),
        ("artist_location", Str),
        ("artist_longitude", Dbl),
        ("artist_name", Str),
        ("duration", Dbl),
        ("num_songs", Int),
        ("title", Str),
        ("year", Int),
    ]
    songSchema = R([Fld(name, dtype()) for name, dtype in schema_fields])

    # Read every song file and drop fully identical rows
    source_path = input_data + "song_data/*/*/*/*.json"
    print("Reading song_data JSON files from S3")
    raw_df = spark.read.json(
        source_path,
        schema=songSchema,
        mode='PERMISSIVE',
        columnNameOfCorruptRecord='corrupt_record',
    )
    df = raw_df.distinct()
    print("Read completed")

    # Songs table: distinct song attributes plus a generated id
    distinct_songs = df.select("title", "artist_id", "year", "duration").distinct()
    songs_table = distinct_songs.withColumn("song_id", monotonically_increasing_id())

    # Store the songs table partitioned by year and artist
    print("Writing Songs table to S3 after processing")
    songs_table.write.mode("overwrite").partitionBy("year", "artist_id").parquet(
        output_data + "songs/"
    )
    print("Completed")

    # Artists table: distinct artist attributes under their source names
    artist_columns = [
        "artist_id",
        "artist_name",
        "artist_location",
        "artist_latitude",
        "artist_longitude",
    ]
    artists_table = df.select(*artist_columns).distinct()

    # Store the artists table
    print("Writing Artists table to S3 after processing")
    artists_table.write.mode("overwrite").parquet(output_data + "artists/")
    print("Completed")
def process_song_data(spark, input_data, output_data,
                      song_glob="song_data/*/*/*/*.json"):
    """
    Read song data, create the songs and artists tables, and write them back.

    Earlier the input path was pinned to one sample file
    (song_data/A/B/C/TRABCEI128F424C983.json), so only a single song was
    ever processed. The full glob is now the default; pass ``song_glob`` to
    restrict the run (e.g. to that one file for a quick smoke test).

    parameters:
        spark: spark session
        input_data: path of song data (``song_glob`` is appended to it)
        output_data: path of output table
        song_glob: path or glob, relative to ``input_data``, selecting the
            song JSON files to read
    """
    # get filepath to song data file(s)
    song_data = input_data + song_glob

    # create song table schema
    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # extract columns to create songs table; song_id is generated because the
    # schema above does not read one. Rows are dropped when year is 0 or
    # missing, or when artist_id is missing.
    song_field = ["title", "duration", "year", "artist_id"]
    songs_table = (
        df.select(song_field)
        .dropDuplicates()
        .withColumn("song_id", F.monotonically_increasing_id())
        .filter(
            ~col("year").isin([0])
            & col("year").isNotNull()
            & col("artist_id").isNotNull()
        )
    )

    # extract columns to create artists table; rows with a null artist_id or
    # artist_name are dropped
    artist_field = [
        "artist_id",
        "artist_name",
        "artist_location",
        "artist_latitude",
        "artist_longitude",
    ]
    artists_table = (
        df.select(artist_field)
        .dropDuplicates()
        .dropna(subset=["artist_id", "artist_name"])
    )

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(
        output_data + "songs/", mode="overwrite"
    )

    # write artists table to parquet files
    artists_table.write.parquet(output_data + "artists/", mode="overwrite")
def create_song_schema():
    """
    Build the schema structure for song data.

    :return: StructType
    """
    # (column name, type constructor), kept in declaration order
    layout = [
        ("num_songs", In),
        ("artist_id", St),
        ("artist_latitude", Fl),
        ("artist_longitude", Fl),
        ("artist_location", St),
        ("artist_name", St),
        ("song_id", St),
        ("title", St),
        ("duration", Fl),
        ("year", SInt),
    ]
    return R([Fld(column, type_ctor()) for column, type_ctor in layout])
def process_song_data(spark, input_data, output_data):
    """
    Import and process the song dataset, then write the resulting songs and
    artists data to parquet files.

    NOTE(review): ``title`` is declared in the read schema but is not
    selected into the songs table - confirm whether that is intended.

    Parameters:
        spark: spark session
        input_data: S3 bucket path to input data from.
        output_data: another S3 bucket path to write data to it.
    """
    # schema for the song files, as (column, type) pairs
    schema_spec = (
        ("song_id", Str),
        ("artist_id", Str),
        ("artist_latitude", Dbl),
        ("artist_location", Str),
        ("artist_longitude", Dbl),
        ("artist_name", Str),
        ("duration", Dbl),
        ("num_songs", Int),
        ("title", Str),
        ("year", Int),
    )
    SongSchema = ST([Fld(name, dtype()) for name, dtype in schema_spec])

    # read the song files, keeping one row per (song_id, artist_id) pair
    source_path = input_data + "song-data/*/*/*/*.json"
    loaded = spark.read.json(source_path, schema=SongSchema)
    df = loaded.dropDuplicates(['song_id', 'artist_id'])

    # songs table, written partitioned by year and artist
    song_columns = ['song_id', 'artist_id', 'year', 'duration']
    songs_table = df.select(*song_columns)
    songs_table.write.parquet(output_data + "songs", partitionBy=['year', 'artist_id'])

    # artists table, written unpartitioned
    artist_columns = [
        'artist_id',
        'artist_name',
        'artist_location',
        'artist_latitude',
        'artist_longitude',
    ]
    artists_table = df.select(*artist_columns)
    artists_table.write.parquet(output_data + "artists")
def process_song_data(spark, input_data, output_data):
    """
    Description: Fetch song_data into a staging dataframe, extract the songs
    and artists tables, and export both as parquet files.

    The artists table is written without partitioning: it has no ``year``
    column, so the former ``partitionBy('year', 'artist_id')`` on that write
    could not succeed.

    Parameters:
        spark       : object for Spark Session
        input_data  : location of song_data
        output_data : location of target S3 bucket
    """
    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    # define schema
    songdata_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songdata_schema)

    # extract columns to create songs table; song_id is generated because the
    # schema above does not read one
    # NOTE(review): the songs table carries artist_name while `title` is read
    # but never used - confirm which column is intended
    songs_table = df.select(['artist_name', 'artist_id', 'year', 'duration'])
    songs_table = (
        songs_table.dropDuplicates()
        .withColumn('song_id', monotonically_increasing_id())
        .select(['song_id', 'artist_name', 'artist_id', 'year', 'duration'])
    )

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy('year', 'artist_id').parquet(output_data + 'songs/')

    # extract columns to create artists table
    selection = [
        'artist_id',
        'artist_name as name',
        'artist_location as location',
        'artist_latitude as latitude',
        'artist_longitude as longitude',
    ]
    artists_table = df.selectExpr(selection).dropDuplicates()

    # write artists table to parquet files (unpartitioned: the table only
    # holds the artist columns selected above)
    artists_table.write.parquet(output_data + 'artists/')
def process_song_data(spark, input_data, output_data):
    """Read song data from source json files, extract the songs and artists
    tables, then store them as parquet files in the target location.

    Parameters:
        spark: spark session
        input_data: source of songs json files
        output_data: target to store extracted tables in as parquet files.
    """
    # local import kept from the original so the block brings its own aliases
    from pyspark.sql.types import (
        StructType as R,
        StructField as Fld,
        DoubleType as Dbl,
        StringType as Str,
        IntegerType as Int,
    )

    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    # create song data schema; latitude/longitude are numeric and location is
    # text (the previous types were mismatched: latitude was a string and
    # location a double, which a textual location cannot be parsed as)
    songSchema = R([
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),
        Fld("artist_id", Str()),
        Fld("artist_name", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("num_songs", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # define fields to be created in the extracted songs_table
    songs_table_fields = ["song_id", "title", "artist_id", "year", "duration"]

    # extract columns to create songs table
    songs_table = df.select(songs_table_fields).dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(output_data + 'songs/')

    # define artist table fields
    artists_table_fields = [
        "artist_id",
        "artist_name",
        "artist_location",
        "artist_latitude",
        "artist_longitude",
    ]

    # extract columns to create artists table
    artists_table = df.select(artists_table_fields).dropDuplicates()

    # write artists table to parquet files
    artists_table.write.parquet(output_data + 'artists/')
def process_song_data(spark, input_data, output_data):
    """
    Read the song JSON files, build the songs and artists tables, print a
    sample of each, and write both as parquet files under ``output_data``.

    The read schema has no ``song_id`` field; the songs table gets a
    generated ``song_id`` from ``monotonically_increasing_id()``.

    Parameters:
        spark: spark session used for reading
        input_data: prefix that "./data/song_data/*/*/*/*.json" is appended to
        output_data: prefix for the "songs" and "artists" outputs
    """
    # schema of the song_data files handed to spark, as (column, type) pairs
    schema_spec = [
        ("artist_id", Str),
        ("artist_latitude", Dbl),
        ("artist_location", Str),
        ("artist_longitude", Dbl),
        ("artist_name", Str),
        ("duration", Dbl),
        ("num_songs", Int),
        ("title", Str),
        ("year", Int),
    ]
    songSchema = R([Fld(name, dtype()) for name, dtype in schema_spec])

    # read the song data files using that json structure
    source_path = input_data + "./data/song_data/*/*/*/*.json"
    df = spark.read.json(source_path, schema=songSchema)

    # songs table: only the needed columns, de-duplicated, plus a generated id
    print('Creating the songs table and dropping duplicates')
    distinct_songs = df.select("title", "artist_id", "year", "duration").distinct()
    songs_table = distinct_songs.withColumn("song_id", monotonically_increasing_id())
    print(
        "--- All duplicate songs have been dropped and the songs table created ---"
    )

    print('Printing some rows from the songs_table')
    songs_table.show(15)

    print('Saving the songs table to the s3 bucket')
    songs_table.write.parquet(output_data + "songs", partitionBy=['year', 'artist_id'])
    print("--- songs.parquet completed ---")

    # artists table: only the needed columns, de-duplicated
    artist_columns = (
        'artist_id',
        'artist_name',
        'artist_location',
        'artist_latitude',
        'artist_longitude',
    )
    print("--- Starting to drop duplicate artists....")
    artists_table = df.selectExpr(*artist_columns).distinct()
    print("All duplicate artists have been dropped......")

    print('Printing some rows from the artists_table')
    artists_table.show(15)

    # write the artists table to parquet files
    artists_table.write.parquet(output_data + "artists")
    print("--- artists.parquet completed ---")
    print("*** process_song_data completed ***\n\n")