def schema_song_data():
    """
    Description: Schema design for song datasets.
    """
    try:
        print("schema_song_data function is starting.")
        print("**************************************")
        schema = R([
            Fld("artist_id", Str()),
            Fld("artist_latitude", Dbl()),
            Fld("artist_location", Str()),
            Fld("artist_longitude", Dbl()),
            Fld("artist_name", Str()),
            Fld("duration", Dbl()),
            Fld("num_songs", Int()),
            Fld("song_id", Str()),
            Fld("title", Str()),
            Fld("year", Int()),
        ])
        print("schema_song_data was created successfully.")
        print("***************************************")
        return schema
    except Exception as e:
        print("schema_song_data creation failed: {}".format(e))
        print("************************************************")
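# NOTE: the snippets in this section use short PySpark type aliases (R, Fld, Str,
# Dbl, Int, Long, ...) and helper functions without showing their imports. The block
# below is a minimal sketch of the imports assumed by most of them; a few snippets
# use other alias names (e.g. St, Fl, LInt, Doub, Ft) that would need analogous
# aliases. These exact names are assumptions, not part of the original code.
import os
import logging
from time import time

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, udf, monotonically_increasing_id
from pyspark.sql.types import (
    StructType as R,
    StructField as Fld,
    StringType as Str,
    DoubleType as Dbl,
    IntegerType as Int,
    LongType as Long,
    IntegerType,
    DateType,
)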
def process_song_data(spark, input_data, output_data):
    '''
    Description: This function can be used to load the song data from the input
        S3 bucket and write the parquet files to the output S3 bucket.

    Arguments:
        spark: SparkSession
        input_data: location for the input data
        output_data: location for the output data

    Returns:
        None
    '''
    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")
    print(song_data)

    # read song data file
    songsSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int())
    ])
    df = spark.read.json(song_data, schema=songsSchema).distinct()
    print(df.count())
    df.show(5, truncate=False)
    df.printSchema()

    # extract columns to create songs table
    songs_table = df.select("song_id", "title", "artist_id", "year", "duration").distinct()
    songs_table.printSchema()
    songs_table.show(5)
    print('songs', songs_table.count())

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode('overwrite').partitionBy(
        "year", "artist_id").parquet(output_data + "songs")

    # extract columns to create artists table
    df.createOrReplaceTempView("df")
    artists_table = spark.sql(
        "select artist_id, artist_name as name, artist_location as location, "
        "artist_latitude as latitude, artist_longitude as longitude from df"
    ).distinct()
    artists_table.printSchema()
    artists_table.show(5)
    print('artists', artists_table.count())

    # write artists table to parquet files
    artists_table.repartitionByRange(
        3, "artist_id").write.mode('overwrite').parquet(output_data + "artists")
def get_log_src_schema():
    """
    Get the source spark schema definition

    :return: The schema definition
    """
    return R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Int()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Str()),
        Fld("sessionId", Int()),
        Fld("song", Str()),
        Fld("status", Int()),
        Fld("ts", Str()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])
def process_song_data(spark, input_data, output_data):
    """Loads song_data from S3, extracts the songs and artists tables,
    and writes the results back to S3.

    Args:
        spark (:obj:`pyspark.sql.session.SparkSession`): SparkSession
        input_data (str): S3 bucket where song files are stored
        output_data (str): S3 bucket file path to store resulting files

    Returns:
        None
    """
    print("**** Starting to process song data *****")

    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    # read song data file
    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    try:
        df = spark.read.json(song_data, schema=songSchema)
    except Exception as e:
        print(e)

    # extract columns to create songs table
    songs_fields = ["song_id", "title", "artist_id", "year", "duration"]
    songs_table = df.select(songs_fields).dropDuplicates(["song_id"])

    # write songs table to parquet files partitioned by year and artist
    try:
        songs_table.write.parquet(output_data + "songs.parquet",
                                  partitionBy=("year", "artist_id"),
                                  mode="overwrite")
    except Exception as e:
        print(e)
    print("**** songs table data load is complete *****")

    # extract columns to create artists table
    artists_fields = ["artist_id", "artist_name as name",
                      "artist_location as location",
                      "artist_latitude as latitude",
                      "artist_longitude as longitude"]
    artists_table = df.selectExpr(artists_fields).dropDuplicates(["artist_id"])

    # write artists table to parquet files
    try:
        artists_table.write.parquet(output_data + "artists.parquet", mode="overwrite")
    except Exception as e:
        print(e)
    print("**** artists table data load is complete *****")
    print("**** song data processing is finished *****")
def read_and_process_airport_data(spark, filename, df_dimension_state_table):
    """Load the airport codes and join with the state dimension data to get
    airports with a state_key."""
    logging.info("Reading airport data")

    # load the airport codes so we can map them to states
    airport_schema = R([
        Fld("ident", Str()),
        Fld("type", Str()),
        Fld("name", Str()),
        Fld("elevation_ft", Int()),
        Fld("continent", Str()),
        Fld("iso_country", Str()),
        Fld("iso_region", Str()),
        Fld("municipality", Str()),
        Fld("gps_code", Str()),
        Fld("iata_code", Str()),
        Fld("local_code", Str()),
        Fld("coordinates", Str())
    ])

    df_airport = spark.read.options(header=True, delimiter=",").csv(filename, airport_schema)

    # cleanse: we only want the airports in the US which map to the states
    # that we have in the states table
    df_airport = df_airport.filter(df_airport.iso_country == "US") \
        .join(df_dimension_state_table,
              F.substring(df_airport.iso_region, 4, 2) == df_dimension_state_table.state_key,
              "inner") \
        .select(df_airport.ident, df_airport.local_code, df_dimension_state_table.state_key)

    return df_airport
def create_log_schema():
    """
    Schema structure for log data

    :return: StructType
    """
    log_schema = R([
        Fld('artist', St()),
        Fld('auth', St()),
        Fld('firstName', St()),
        Fld('gender', St()),
        Fld('itemInSession', LInt()),
        Fld('lastName', St()),
        Fld('length', Fl()),
        Fld('level', St()),
        Fld('location', St()),
        Fld('method', St()),
        Fld('page', St()),
        Fld('registration', Dbl()),
        Fld('sessionId', LInt()),
        Fld('song', St()),
        Fld('status', LInt()),
        Fld('ts', LInt()),
        Fld('userAgent', St()),
        Fld('userId', St())
    ])
    return log_schema
def get_log_schema():
    """
    Creates a schema for log data.

    :return: schema
    """
    log_schema = R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Str()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Str()),
        Fld("song", Str()),
        Fld("status", Str()),
        Fld("ts", Long()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])
    return log_schema
def process_song_data(spark, input_data, output_data):
    """Read the song dataset and write the songs and artists tables as parquet files."""
    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    song_fields = ["title", "artist_id", "year", "duration"]

    # extract columns to create songs table
    songs_table = df.select(song_fields).dropDuplicates().withColumn(
        "song_id", monotonically_increasing_id())

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(output_data + 'songs/')

    artists_fields = ["artist_id", "artist_name as name", "artist_location as location",
                      "artist_latitude as latitude", "artist_longitude as longitude"]

    # extract columns to create artists table
    artists_table = df.selectExpr(artists_fields).dropDuplicates()

    # write artists table to parquet files
    artists_table.write.parquet(output_data + 'artists/')
def read_city_demographic_data(spark, filename):
    """Read the named city demographic data file into a Spark dataframe."""
    logging.info("Reading demographic data: {}".format(filename))

    demographic_schema = R([
        Fld("City", Str()),
        Fld("State", Str()),
        Fld("Median Age", Ft()),
        Fld("Male Population", Int()),
        Fld("Female Population", Int()),
        Fld("Total Population", Int()),
        Fld("Number of Veterans", Int()),
        Fld("Foreign Born", Int()),
        Fld("Average Household Size", Ft()),
        Fld("State Code", Str()),
        Fld("Race", Str()),
        Fld("Count", Int())
    ])

    df_demographic = None
    try:
        df_demographic = spark.read.options(header=True, delimiter=";").csv(
            filename, demographic_schema)
    except EOFError as ex:
        logging.exception("End of file exception:")
        print(ex)
    except FileNotFoundError as ex:
        logging.exception("File not found exception:")
        print(ex)

    return df_demographic
def immigration_etl(source="s3://immigration-us-1/sas_data",
                    destination="s3://immigration-us-1/sas_data_ready_to_use",
                    country_dict="s3://immigration-us-1/raw_data/country_dict.csv",
                    visa_dict="s3://immigration-us-1/raw_data/visa_dict.csv"):
    """Read the immigration parquet data, keep air arrivals, map country and visa
    codes using the lookup files, and write the result back to parquet."""
    # Local udfs
    spaceDeleteUDF = udf(lambda s: s.replace(" ", ""), Str())
    ampesandDeleteUDF = udf(lambda s: s.replace("'", ""), Str())
    udf_to_datetime_sas = udf(lambda x: date_converter(x), DateType())

    Schema_country = R([
        Fld("id", Str()),
        Fld("country", Str())
    ])
    Schema_Visa = R([
        Fld("id", Str()),
        Fld("Visa_Type", Str())
    ])

    df_spark = spark.read.parquet(source)

    # Only immigrants arriving by air
    immigrants = df_spark.where(F.col("i94mode") == 1)
    immigrants = immigrants.select("cicid", "i94yr", "i94mon", "i94cit", "i94res",
                                   "i94port", "arrdate", "i94visa", "biryear",
                                   "gender", "visatype", "airline")

    # map citizenship and residence country codes to country names
    country_dict = spark.read.csv(country_dict, header=True, mode="DROPMALFORMED",
                                  sep="=", schema=Schema_country)
    immigrants = immigrants.join(country_dict, immigrants.i94cit == country_dict.id, how='right')
    immigrants = immigrants.withColumnRenamed("country", "cit_country")
    immigrants = immigrants.drop('id', 'i94cit')

    immigrants = immigrants.join(country_dict, immigrants.i94res == country_dict.id, how='right')
    immigrants = immigrants.withColumnRenamed("country", "res_country")
    immigrants = immigrants.drop('id', 'i94res')

    # map visa codes to visa types
    visa_dict = spark.read.csv(visa_dict, header=False, mode="DROPMALFORMED",
                               sep="=", schema=Schema_Visa)
    immigrants = immigrants.join(visa_dict, immigrants.i94visa == visa_dict.id, how='right')
    immigrants = immigrants.drop('id', 'i94visa')

    # convert the SAS arrival date and cast numeric columns to integers
    immigrants = immigrants.withColumn("arrdate", udf_to_datetime_sas("arrdate"))
    immigrants = immigrants.withColumn("cicid", immigrants["cicid"].cast(IntegerType()))
    immigrants = immigrants.withColumn("i94yr", immigrants["i94yr"].cast(IntegerType()))
    immigrants = immigrants.withColumn("biryear", immigrants["biryear"].cast(IntegerType()))
    immigrants = immigrants.withColumn("i94mon", immigrants["i94mon"].cast(IntegerType()))

    immigrants.show(10)
    immigrants.write.parquet(destination)
def process_song_data(spark, input_data, output_data):
    """
    Description:
        Reads the song_data JSON files from S3 into a Spark dataframe, extracts
        columns from that dataframe to form the songs table and the artists table,
        and transforms both tables into the format this project needs.
    Parameters:
        - spark: spark session
        - input_data: location of the song_data json files (in an S3 bucket)
        - output_data: location where the final tables will be saved (in an S3 bucket)
    Return:
        None
    """
    # -------------------- read song data -------------------- #
    print('Read song_data...')

    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data_*.json")

    # define the song data schema for reading
    SongSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Doub()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Doub()),
        Fld("artist_name", Str()),
        Fld("duration", Doub()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long())
    ])

    # read song data file
    song_df = spark.read.json(song_data, schema=SongSchema)

    # -------------------- build songs table -------------------- #
    # extract columns to create the songs dataframe
    songinf_df = song_df.select(['song_id', 'title', 'artist_id', 'year', 'duration'])
    songinf_df = songinf_df.dropDuplicates(['song_id'])
    songinf_df = songinf_df.dropna(how="any", subset=["song_id"])
    songinf_df = songinf_df.filter(songinf_df.song_id != "")
    print('Songs table: ')
    songinf_df.sort('song_id').show(5)

    # write songs table to parquet files partitioned by year and artist
    print('Save songs table into S3...')
    songinf_df.write.partitionBy("year", "artist_id").parquet(
        "{}/song_table.parquet".format(output_data))

    # -------------------- build artists table -------------------- #
    # extract columns to create the artists dataframe
    artist_df = song_df.select(['artist_id', 'artist_name', 'artist_location',
                                'artist_latitude', 'artist_longitude'])
    artist_df = artist_df.dropDuplicates(['artist_id'])
    artist_df = artist_df.dropna(how="any", subset=["artist_id"])
    artist_df = artist_df.filter(artist_df.artist_id != "")
    print('Artists table: ')
    artist_df.sort('artist_id').show(5)

    # write artists table to parquet files
    print('Save artists table into S3...')
    artist_df.write.parquet("{}/artist_table.parquet".format(output_data))
def set_data_schema(schema):
    """
    This function returns the requested schema definition.
    Param: schema name
    Output: schema
    """
    print("...Setting data schema")
    try:
        songSchema = R([
            Fld("artist_id", Str()),
            Fld("artist_latitude", Str()),
            Fld("artist_longitude", Str()),
            Fld("artist_location", Str()),
            Fld("artist_name", Str()),
            Fld("song_id", Str()),
            Fld("title", Str()),
            Fld("duration", Dbl()),
            Fld("year", Int())
        ])

        eventSchema = R([
            Fld("artists", Str()),
            Fld("auth", Str()),
            Fld("first_name", Str()),
            Fld("gender", Str()),
            Fld("item_in_session", Int()),
            Fld("last_name", Str()),
            Fld("length", Str()),
            Fld("level", Str()),
            Fld("location", Str()),
            Fld("method", Str()),
            Fld("page", Str()),
            Fld("registration", Str()),
            Fld("session_id", Int()),
            Fld("song", Str()),
            Fld("status", Int()),
            Fld("ts", Str()),
            Fld("user_agent", Str()),
            Fld("user_id", Int())
        ])

        create_schema = {'Song': songSchema, 'Event': eventSchema}
        return create_schema[schema]
    except KeyError:
        return "Error: schema does not exist"
def process_song_data(spark, input_data, output_data):
    """
    This function processes the song data of sparkify, creates facts/dimensions
    via spark and saves them to our data lake afterwards.

    Arguments:
        spark {SparkSession}: Spark session to launch the program
        input_data {str}: location (local/s3) where the (root) input song data resides
        output_data {str}: location (local/s3) where the (root) output files should be written
    """
    # get filepath to song data file
    # song_data = f"{input_data}song_data/A/A/A/*.json"
    song_data = f"{input_data}song_data/*/*/*/*.json"

    # read song data file
    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Str()),
        Fld("artist_longitude", Str()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),
    ])

    # since the schema cannot be inferred automatically, we need to specify it beforehand
    df_song = spark.read.json(song_data, schema=songSchema)
    df_song.cache()

    # extract columns to create songs table
    songs_table = df_song.filter(df_song.song_id != '') \
        .select(['song_id', 'title', 'artist_id', 'year', 'duration']) \
        .dropDuplicates(['song_id'])

    # write songs table to parquet files partitioned by year and artist
    output_song_data = f"{output_data}song_data/"
    songs_table.write.mode('overwrite').partitionBy(
        "year", "artist_id").parquet(output_song_data)

    # extract columns to create artists table
    artists_table = df_song.filter(df_song.artist_id != '') \
        .selectExpr(['artist_id', 'artist_name as name', 'artist_location as location',
                     'artist_latitude as latitude', 'artist_longitude as longitude']) \
        .dropDuplicates(['artist_id'])

    # write artists table to parquet files
    output_artist_data = f"{output_data}artist_data/"
    artists_table.write.mode('overwrite').parquet(output_artist_data)
def process_song_data(spark, input_data, output_data):
    """Import the song dataset, extract columns to create the songs and artists
    tables, and write those tables to parquet files.

    Parameters:
        spark: name of spark session
        input_data: location of the source data s3 bucket
        output_data: location of the destination data s3 bucket

    Returns:
        writes songs table in parquet to output_data location + songs
        writes artists table in parquet to output_data location + artists
    """
    # Setting up the JSON table structure for the Song dataset
    song_dataset_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Str()),
    ])

    # get filepath to song data file
    # use "song_data/*/*/*/*.json" for the full dataset
    # use "song_data/A/B/C/TRABCEI128F424C983.json" to pull a single record
    song_data = input_data + "song_data/*/*/*/*.json"

    # read song data file with the dataset schema
    df = spark.read.json(song_data, schema=song_dataset_schema)

    # extract columns to create songs table
    songs_table = df.select('song_id', 'title', 'artist_id', 'year', 'duration')

    # drop duplicate rows in songs table
    songs_table = songs_table.dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode('append').partitionBy(
        'year', 'artist_id').parquet(output_data + "songs")

    # extract columns to create artists table
    artists_table = df.select('artist_id', 'artist_name', 'artist_location',
                              'artist_latitude', 'artist_longitude')

    # drop duplicate rows in artists table
    artists_table = artists_table.dropDuplicates()

    # write artists table to parquet files
    artists_table.write.mode('append').parquet(output_data + "artists")
def get_residence_cities(spark):
    cities = pd.read_csv('residence_city.txt', sep='=', names=['id', 'country'])
    cities['country'] = cities['country'].str.replace("'", '').str.strip()
    cities_data = cities.values.tolist()

    cities_schema = R([Fld('id', Str(), True), Fld('country', Str(), True)])
    cities = spark.createDataFrame(cities_data, cities_schema)
    cities.write.mode('overwrite').parquet('resident_city.parquet')
    return cities
def process_song_data(spark, input_data, output_data):
    """
    This function loads the songs JSON dataset from S3, then uses the data to
    create the songs and artists tables.

    Input:
        spark = SparkSession object
        input_data = Start of path variable for input files
        output_data = Start of path variable for output files

    Output:
        None
    """
    # get filepath to song data file
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')

    # Define schema
    SongSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])

    # read song data file
    df = spark.read.json(song_data, schema=SongSchema)

    # extract columns to create songs table
    songs_table = df.select("song_id", "title", "artist_id", "year", "duration").dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    output_path = os.path.join(output_data, 'songs_table.parquet')
    songs_table.write.partitionBy("year", "artist_id").parquet(output_path, mode="overwrite")

    # extract columns to create artists table
    artists_table = df.select("artist_id", "artist_name", "artist_location",
                              "artist_latitude", "artist_longitude").dropDuplicates()

    # write artists table to parquet files
    output_path = os.path.join(output_data, 'artists_table.parquet')
    artists_table.write.parquet(output_path, mode="overwrite")

    # export whole songs data file to parquet
    output_path = os.path.join(output_data, 'songs_data_table.parquet')
    df.write.parquet(output_path, mode="overwrite")
def process_song_data(spark, input_data, output_data):
    '''
    Load song data in json format from an S3 bucket, process the data by
    extracting the songs table and artists table, and save these tables back
    to the S3 bucket.

    :param spark: spark session
    :param input_data: data location for input data
    :param output_data: data location for output data
    :return: no return value
    '''
    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    # create songs schema
    songSchema = R([
        Fld('artist_id', Str()),
        Fld('artist_latitude', Dbl()),
        Fld('artist_location', Str()),
        Fld('artist_longitude', Dbl()),
        Fld('artist_name', Str()),
        Fld('duration', Dbl()),
        Fld('num_songs', Int()),
        Fld('title', Str()),
        Fld('year', Int()),
    ])

    # load songs json files from S3
    df_songs = spark.read.json(song_data, schema=songSchema)

    # select columns for songs_table
    songs_attr = ['title', 'artist_id', 'year', 'duration']
    songs_table = df_songs.select(songs_attr)\
        .dropDuplicates()\
        .withColumn('song_id', monotonically_increasing_id())

    # write songs_table to S3
    songs_table.write.partitionBy('year', 'artist_id').parquet(output_data + 'songs/')

    # select artists columns
    artists_attr = ['artist_id', 'artist_name', 'artist_location',
                    'artist_latitude', 'artist_longitude']
    artists_table = df_songs.select(artists_attr)\
        .dropDuplicates()

    artists_table = artists_table\
        .withColumnRenamed('artist_name', 'name')\
        .withColumnRenamed('artist_location', 'location')\
        .withColumnRenamed('artist_latitude', 'latitude')\
        .withColumnRenamed('artist_longitude', 'longitude')

    # write artists_table to S3
    artists_table.write.parquet(output_data + 'artists/')
def process_song_data(spark, input_data, output_data):
    """Process song data, transform the data into songs and artists tables and
    store it in parquet files on S3.

    Parameters
    ----------
    spark : SparkSession
        active Spark session used to read and write the data
    input_data : string
        input data path prefix
    output_data : string
        output data path prefix
    """
    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")

    song_schema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str(), False),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str(), False),
        Fld("song_id", Str(), False),
        Fld("title", Str(), False),
        Fld("duration", Dbl(), False),
        Fld("year", Int())
    ])

    # read song data file
    df = spark.read.json(song_data, song_schema)

    # extract columns to create songs table
    songs_table = df.select(
        ["song_id", "title", "artist_id", "year", "duration"])

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode("overwrite").partitionBy('year', 'artist_id') \
        .parquet(os.path.join(output_data, 'analytics/songs'))

    # extract columns to create artists table
    artists_table = df.select([
        "artist_id", "artist_name", "artist_location", "artist_latitude",
        "artist_longitude"
    ])
    artists_table = artists_table.withColumnRenamed("artist_name", "name") \
        .withColumnRenamed("artist_location", "location") \
        .withColumnRenamed("artist_latitude", "latitude") \
        .withColumnRenamed("artist_longitude", "longitude")

    # write artists table to parquet files
    artists_table.write.mode("overwrite") \
        .parquet(os.path.join(output_data, 'analytics/artists'))
def process_song_data(spark, input_data, output_data):
    """
    Process the songs data from S3 storage and create the analytical tables,
    the songs table and the artists table.

    This function reads the data in json files from S3 storage, transforms the
    data into the analytical tables (songs and artists), and writes them into
    partitioned parquet files on S3.

    Args:
        spark: the spark session
        input_data: the S3 bucket to read data from
        output_data: the S3 bucket to write analytics tables to
    """
    # get filepath to song data file
    song_data = input_data + "song_data/*/*/*/*.json"

    # define the song data schema
    song_data_schema = R([
        Fld("artist_id", Str(), False),
        Fld("artist_latitude", Str(), True),
        Fld("artist_longitude", Str(), True),
        Fld("artist_location", Str(), True),
        Fld("artist_name", Str(), False),
        Fld("song_id", Str(), False),
        Fld("title", Str(), False),
        Fld("duration", Dbl(), False),
        Fld("year", Int(), False)
    ])

    # read song data file
    df = spark.read.json(song_data, schema=song_data_schema)

    # extract columns to create songs table
    songs_table = df.select("song_id", "title", "artist_id", "year", "duration").distinct()

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.parquet(output_data + "songs_table.parquet",
                              mode="overwrite",
                              partitionBy=["year", "artist_id"])

    # extract columns to create artists table
    artists_table = df.select(
        "artist_id",
        col("artist_name").alias("name"),
        col("artist_location").alias("location"),
        col("artist_latitude").alias("latitude"),
        col("artist_longitude").alias("longitude"),
    ).distinct()

    # write artists table to parquet files
    artists_table.write.parquet(output_data + "artists_table.parquet", mode="overwrite")
def process_song_data(spark, input_data, output_data):
    """
    The function loads data from the song_data dataset, extracts columns for the
    songs and artists tables, and writes the data into parquet files which will
    be loaded on S3.
    """
    song_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long())
    ])

    # get filepath to song data file
    song_data = 'song_data/*/*/*/*.json'

    # read song data file
    df = spark.read.json(os.path.join(input_data, song_data), schema=song_schema)

    # extract columns to create songs table
    songs_table = df.select('song_id', 'title', 'artist_id', 'year', 'duration').dropDuplicates()
    songs_table.createOrReplaceTempView('songs')

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy('year', 'artist_id').parquet(
        os.path.join(output_data, 'songs/songs.parquet'), 'overwrite')

    # extract columns to create artists table
    artists_table = df.select('artist_id', 'artist_name', 'artist_location',
                              'artist_latitude', 'artist_longitude') \
        .withColumnRenamed('artist_name', 'name') \
        .withColumnRenamed('artist_location', 'location') \
        .withColumnRenamed('artist_latitude', 'latitude') \
        .withColumnRenamed('artist_longitude', 'longitude') \
        .dropDuplicates()
    artists_table.createOrReplaceTempView('artists')

    # write artists table to parquet files
    artists_table.write.parquet(
        os.path.join(output_data, 'artists/artists.parquet'), 'overwrite')
def process_song_data(spark, input_data, output_data):
    """
    Method to process song data and create tables: songs, artists

    :param spark: Spark session
    :param input_data: S3 bucket
    :param output_data: S3 bucket
    :return: Data frame of song data
    """
    # get filepath to song data file
    song_data = input_data + '/song-data/A/A/B/*.json'

    songs_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int())
    ])

    # read song data file
    print('Reading song data.')
    df = spark.read.json(song_data, schema=songs_schema)

    song_columns = ['song_id', 'title', 'artist_id', 'year', 'duration']

    # extract columns to create songs table
    songs_table = df.selectExpr(song_columns).dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    print('Writing songs to parquet.')
    write_parquet(songs_table, output_data, 'songs', 'year', 'artist_id')

    artist_columns = [
        'artist_id', 'artist_name as name', 'artist_location as location',
        'artist_latitude as latitude', 'artist_longitude as longitude'
    ]

    # extract columns to create artists table
    artists_table = df.selectExpr(artist_columns).dropDuplicates()

    # write artists table to parquet files
    print('Writing artists to parquet.')
    write_parquet(artists_table, output_data, 'artists', None, None)

    return df
def process_song_data(spark, input_data, output_data):
    """
    Loads the song_data from AWS S3 (input_data), extracts the songs and artists
    tables, and then loads the processed data back to S3 (output_data).

    :param spark: Spark Session object
    :param input_data: Location (AWS S3 path) of songs metadata (song_data) JSON files
    :param output_data: Location (AWS S3 path) where dimensional tables will be stored in parquet format
    """
    # Get filepath to song data file
    song_data = input_data + "song_data/*/*/*/*.json"

    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # Read song data file
    print("Reading song_data JSON files from S3")
    df = spark.read.json(song_data, mode='PERMISSIVE', schema=songSchema,
                         columnNameOfCorruptRecord='corrupt_record').dropDuplicates()
    print("Read completed")

    # Extract columns to create songs table
    songs_table = df.select("title", "artist_id", "year", "duration").dropDuplicates() \
        .withColumn("song_id", monotonically_increasing_id())

    print("Writing Songs table to S3 after processing")
    # Write songs table to parquet files partitioned by year and artist
    songs_table.write.parquet(output_data + "songs/", mode="overwrite",
                              partitionBy=["year", "artist_id"])
    print("Completed")

    # Extract columns to create artists table
    artists_table = df.select("artist_id", "artist_name", "artist_location",
                              "artist_latitude", "artist_longitude") \
        .dropDuplicates()

    print("Writing Artists table to S3 after processing")
    # Write artists table to parquet files
    artists_table.write.parquet(output_data + "artists/", mode="overwrite")
    print("Completed")
def process_song_data(spark, input_data, output_data):
    """
    Read song data from S3, create the songs_table and artists_table, and load
    them back to S3.

    parameters:
        spark: spark session
        input_data: path of song data
        output_data: path of output table
    """
    # get filepath to song data file
    # song_data = input_data + "song_data/*/*/*/*.json"
    song_data = input_data + "song_data/A/B/C/TRABCEI128F424C983.json"

    # create song table schema
    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # extract columns to create songs table; drop rows where year or artist_id is
    # missing, and year should not equal 0
    song_field = ["title", "duration", "year", "artist_id"]
    songs_table = df.select(song_field).dropDuplicates() \
        .withColumn("song_id", F.monotonically_increasing_id()) \
        .filter(~col("year").isin([0]) & col("year").isNotNull() & col("artist_id").isNotNull())

    # extract columns to create artists table; drop rows where artist_id or name
    # contains any null values
    artist_field = [
        "artist_id", "artist_name", "artist_location", "artist_latitude",
        "artist_longitude"
    ]
    artists_table = df.select(artist_field).dropDuplicates().dropna(
        subset=["artist_id", "artist_name"])

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(output_data + "songs/",
                                                               mode="overwrite")

    # write artists table to parquet files
    artists_table.write.parquet(output_data + "artists/", mode="overwrite")
def process_song_data(spark, input_data, output_data):
    # get filepath to song data file
    song_data = input_data + "./data/song_data/*/*/*/*.json"

    # Create the song_data file schema that we are going to add to spark
    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # reading song data file json structure
    df = spark.read.json(song_data, schema=songSchema)

    # Filter out only the needed columns for the songs table
    song_fields = ["title", "artist_id", "year", "duration"]

    print('Creating the songs table and dropping duplicates')
    songs_table = df.select(song_fields).dropDuplicates().withColumn(
        "song_id", monotonically_increasing_id())
    print("--- All duplicate songs have been dropped and the songs table created ---")

    print('Printing some rows from the songs_table')
    songs_table.show(15)

    print('Saving the songs table to the s3 bucket')
    songs_table.write.partitionBy('year', 'artist_id').parquet(output_data + "songs")
    print("--- songs.parquet completed ---")

    # Filter out only the needed columns for the artists table
    artists_data = [
        'artist_id', 'artist_name', 'artist_location', 'artist_latitude',
        'artist_longitude'
    ]

    print("--- Starting to drop duplicate artists....")
    artists_table = df.selectExpr(artists_data).dropDuplicates()
    print("All duplicate artists have been dropped......")

    print('Printing some rows from the artists_table')
    artists_table.show(15)

    # writing the artists table to the parquet files
    artists_table.write.parquet(output_data + "artists")
    print("--- artists.parquet completed ---")
    print("*** process_song_data completed ***\n\n")
def process_song_data(spark, input_data, output_data):
    """
    Description: This function fetches song_data from S3 into a staging dataframe,
    then extracts the songs and artists tables, and eventually exports the data back to S3.

    Parameters:
        spark : object for Spark Session
        input_data : location of song_data
        output_data : location of target S3 bucket
    """
    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    # define schema
    songdata_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songdata_schema)

    # extract columns to create songs table
    songs_table = df.select(['title', 'artist_id', 'year', 'duration'])
    songs_table = songs_table.dropDuplicates().withColumn('song_id', monotonically_increasing_id()) \
        .select(['song_id', 'title', 'artist_id', 'year', 'duration'])

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy('year', 'artist_id').parquet(output_data + 'songs/')

    # extract columns to create artists table
    selection = ['artist_id', 'artist_name as name',
                 'artist_location as location', 'artist_latitude as latitude',
                 'artist_longitude as longitude']
    artists_table = df.selectExpr(selection).dropDuplicates()

    # write artists table to parquet files
    artists_table.write.parquet(output_data + 'artists/')
def process_song_data(spark, input_data, output_data):
    """
    process_song_data - Loads the song data files from S3, saves the song
    information to a parquet file (partitioned by year and artist_id), and then
    extracts the distinct artists and saves them to a parquet file.
    """
    # Get filepath to song data file
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')
    # song_data = os.path.join(input_data, 'song_data/A/A/A/TRAAAAK128F9318786.json')

    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])

    # Read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # Extract columns to create songs table
    songs_table = df.select(
        ['song_id', 'title', 'artist_id', 'year', 'duration'])

    # Write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").mode('overwrite').parquet(
        os.path.join(output_data, 'songs.parquet'))

    # Extract columns to create artists table, and find the distinct artists
    artists_table = df.select([
        'artist_id', 'artist_name', 'artist_location', 'artist_latitude',
        'artist_longitude'
    ]).withColumnRenamed('artist_name', 'name') \
      .withColumnRenamed('artist_location', 'location') \
      .withColumnRenamed('artist_latitude', 'latitude') \
      .withColumnRenamed('artist_longitude', 'longitude') \
      .distinct()

    # Write artists table to parquet files
    artists_table.write.mode('overwrite').parquet(
        os.path.join(output_data, 'artists.parquet'))
def process_song_data(spark, input_data, output_data):
    """
    Loads song_data from S3, extracts the needed columns for 'song_table' and
    'artist_table', and writes them in parquet format on S3.

    Parameters:
        spark : Spark Session
        input_data : Location of song_data json files with the songs metadata
        output_data : S3 bucket where tables are stored in parquet format
    """
    # get filepath to song data file
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')

    # Define the right types for the input json structure
    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # extract columns to create songs table
    songs_table = df.selectExpr(
        "song_id", "title", "artist_id", "year",
        "duration").orderBy("song_id").drop_duplicates()

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy('year', 'artist_id').parquet(
        os.path.join(output_data, 'songs'))

    # extract columns to create artists table
    artists_table = df.selectExpr("artist_id", "artist_name as name",
                                  "artist_location as location",
                                  "artist_latitude as latitude",
                                  "artist_longitude as longitude").orderBy(
                                      "artist_id").drop_duplicates()

    # write artists table to parquet files
    artists_table.write.parquet(os.path.join(output_data, 'artists'))
def process_song_data(spark, input_data_songs, output_data):
    """
    Read song data by providing an expected schema.
    Create the songs and artists tables.
    """
    # define song data schema to improve read performance
    song_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long())
    ])

    song_data = input_data_songs

    # read song data and report how long the read took
    t_start = time()
    dfSongs = spark.read.json(song_data, schema=song_schema)
    t_end = time() - t_start
    print('Read song data in {} secs'.format(t_end))

    dfSongs.printSchema()
    dfSongs.count()
    dfSongs.show(5)

    # extract columns to create songs table
    songs_table = dfSongs.filter(dfSongs.song_id != '') \
        .select(['song_id', 'title', 'artist_id', 'year', 'duration'])
    songs_table.show(5)

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy(
        "year", "artist_id").mode('overwrite').parquet(output_data + 'songs/songs_table.parquet')

    # extract columns to create artists table
    artists_table = dfSongs.filter(dfSongs.artist_id != '') \
        .select(col("artist_id"), col("artist_name").alias("name"),
                col("artist_location").alias("location"),
                col("artist_longitude").alias("longitude"),
                col("artist_latitude").alias("latitude")) \
        .dropDuplicates()
    artists_table.show(5)

    # write artists table to parquet files
    artists_table.write.mode('overwrite').parquet(
        output_data + 'artists/artists_table.parquet')
def get_song_src_schema():
    """
    Get the source spark schema definition

    :return: The schema definition
    """
    return R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])
def process_song_data(spark, input_data, output_data):
    """
    Read song data files in JSON format from Amazon S3, load the processed data
    into two analytical tables, and write these tables as parquet files back to
    Amazon S3.
    """
    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data", *3*["*"], "*.json")
    # song_data = os.path.join("data", "song_data", *3*["*"], "*.json")

    # create song data schema
    song_data_schema = R([
        Fld("artist_id", Str(), False),
        Fld("artist_latitude", Str(), True),
        Fld("artist_longitude", Str(), True),
        Fld("artist_location", Str(), True),
        Fld("artist_name", Str(), False),
        Fld("song_id", Str(), False),
        Fld("title", Str(), False),
        Fld("duration", Dbl(), False),
        Fld("year", Int(), False)
    ])

    # read song data file
    df = spark.read.json(path=song_data, schema=song_data_schema)

    # extract columns to create songs table
    songs_table = df.select("song_id", "title", "artist_id", "year", "duration").distinct()

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.parquet(
        output_data + "songs_table.parquet",
        mode="overwrite",
        partitionBy=["year", "artist_id"]
    )

    # extract columns to create artists table
    artists_table = df.select("artist_id",
                              col("artist_name").alias("name"),
                              col("artist_location").alias("location"),
                              col("artist_latitude").alias("latitude"),
                              col("artist_longitude").alias("longitude")
                              ).distinct()

    # write artists table to parquet files
    artists_table.write.parquet(output_data + "artists_table.parquet", mode="overwrite")
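# Minimal usage sketch: how one of the process_song_data variants above would
# typically be invoked. The entry point and bucket paths below are assumptions
# (placeholders), not part of the original snippets.
def main():
    # build a Spark session for the ETL run
    spark = SparkSession.builder \
        .appName("sparkify-data-lake-etl") \
        .getOrCreate()

    input_data = "s3a://example-input-bucket/"    # placeholder input location
    output_data = "s3a://example-output-bucket/"  # placeholder output location

    process_song_data(spark, input_data, output_data)
    spark.stop()


if __name__ == "__main__":
    main()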