Example 1
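Note: the shorthand names R, Fld, Str, Dbl, Int, etc. used throughout these examples are the usual aliased imports from pyspark.sql.types; a minimal sketch of the assumed imports follows (alias spellings vary slightly between examples, e.g. St, Doub, Ft, LInt).

# Assumed shorthand imports used by most of these snippets (not part of the originals).
import os
from pyspark.sql import functions as F
from pyspark.sql.functions import col, udf, monotonically_increasing_id
from pyspark.sql.types import (
    StructType as R,
    StructField as Fld,
    StringType as Str,
    DoubleType as Dbl,
    IntegerType as Int,
    LongType as Long,
    FloatType as Ft,
    DateType,
)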
def schema_song_data():
    """
    Description:
        Schema design for song datasets.
    """
    try:
        print("schema_song_data fuction is statrting.")
        print("**************************************")

        schema = R([
            Fld("artist_id", Str()),
            Fld("artist_latitude", Dbl()),
            Fld("artist_location", Str()),
            Fld("artist_longitude", Dbl()),
            Fld("artist_name", Str()),
            Fld("duration", Dbl()),
            Fld("num_songs", Int()),
            Fld("song_id", Str()),
            Fld("title", Str()),
            Fld("year", Int()),
        ])

        print("schema_song_data is successfull created")
        print("***************************************")
        return schema

    except Exception as e:
        print("schema_song_data function failed: {}".format(e))
        print("************************************************")
Example 2
def process_song_data(spark, input_data, output_data):
    '''
        Description: This function can be used to load the song data from the input S3 bucket
                     and write the parquet files to the output S3 bucket.
        Arguments:
            spark: SparkSession
            input_data: location for the input data
            output_data: location for the output data
        Returns:
            None
    '''
    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")
    print(song_data)

    # read song data file
    songsSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int())
    ])

    df = spark.read.json(song_data, schema=songsSchema).distinct()
    print(df.count())
    df.show(5, truncate=False)

    df.printSchema()

    # extract columns to create songs table

    songs_table = df.select("song_id", "title", "artist_id", "year",
                            "duration").distinct()
    songs_table.printSchema()
    songs_table.show(5)
    print('songs', songs_table.count())

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode('overwrite').partitionBy(
        "year", "artist_id").parquet(output_data + "songs")

    # extract columns to create artists table
    df.createOrReplaceTempView("df")
    artists_table = spark.sql(
        "select artist_id, artist_name as name, artist_location as location, artist_latitude as latitude, artist_longitude as longitude from df"
    ).distinct()
    artists_table.printSchema()
    artists_table.show(5)
    print('artists', artists_table.count())

    # write artists table to parquet files
    artists_table.repartitionByRange(
        3,
        "artist_id").write.mode('overwrite').parquet(output_data + "artists")
Example 3
def get_log_src_schema():
    """
    Get the source spark schema definition
    :return: The schema definition
    """

    return R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Int()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Str()),
        Fld("sessionId", Int()),
        Fld("song", Str()),
        Fld("status", Int()),
        Fld("ts", Str()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])
Example 4
def process_song_data(spark, input_data, output_data):
    """This function loads song_data from S3 and processes it by extracting the songs and artist tables
        and then again loaded back to S3
    Args:
        spark(:obj:`pyspark.sql.session.SparkSession`): SparkSession
        input_data (str): S3 bucket where song files are stored
        output (str): S3 bucket file path to store resulting files

    Returns:
        None
    """
    print("**** Starting to process song data *****")
    # get filepath to song data file
    song_data = input_data+'song_data/*/*/*/*.json'
    
    # read song data file
    
    songSchema = R([
        Fld("artist_id",Str()),
        Fld("artist_latitude",Dbl()),
        Fld("artist_location",Str()),
        Fld("artist_longitude",Dbl()),
        Fld("artist_name",Str()),
        Fld("song_id",Str()),
        Fld("duration",Dbl()),
        Fld("num_songs",Int()),
        Fld("title",Str()),
        Fld("year",Int()),
    ])
    
    
    try:
        df = spark.read.json(song_data, schema=songSchema)
    except Exception as e:
        print(e)
        raise  # df is required below, so re-raise instead of continuing
        
    # extract columns to create songs table
    songs_fields = ["song_id", "title", "artist_id", "year", "duration"]
    songs_table = df.select(songs_fields).dropDuplicates(["song_id"])
    
    # write songs table to parquet files partitioned by year and artist
    try:
        songs_table.write.parquet(output_data + "songs.parquet", partitionBy=("year", "artist_id"), mode="overwrite")
    except Exception as e:
        print(e)
    
    print("**** songs table data load is complete *****")
    
    # extract columns to create artists table
    artists_fields = ["artist_id", "artist_name as name", "artist_location as location", "artist_latitude as lattitude", "artist_longitude as longitude"]
    artists_table = df.selectExpr(artists_fields).dropDuplicates(["artist_id"])
    
    # write artists table to parquet files
    try:
        artists_table.write.parquet(output_data + "artists.parquet",  mode="overwrite")
    except Exception as e:
        print(e)
    print("**** artists table data load is complete *****")
    
    print("**** song data processing is finished *****")
Example 5
def read_and_process_airport_data(spark, filename, df_dimension_state_table):
    """ Load the airport codes join with state dimension data to get airports with state_key"""
    logging.info("Reading airport data")
    # load the airport codes so we can map them to states
    airport_schema = R([
        Fld("ident", Str()),
        Fld("type", Str()),
        Fld("name", Str()),
        Fld("elevation_ft", Int()),
        Fld("continent", Str()),
        Fld("iso_country", Str()),
        Fld("iso_region", Str()),
        Fld("municipality", Str()),
        Fld("gps_code", Str()),
        Fld("iata_code", Str()),
        Fld("local_code", Str()),
        Fld("coordinates", Str())
    ])

    df_airport = spark.read.options(header=True,
                                    delimiter=",").csv(filename, airport_schema)

    # cleanse: we only want the airports in the US which map to the states that we have in the states table

    df_airport = df_airport.filter(df_airport.iso_country == "US") \
        .join(df_dimension_state_table, F.substring(df_airport.iso_region, 4, 2) == df_dimension_state_table.state_key,
              "inner") \
        .select(df_airport.ident, df_airport.local_code, df_dimension_state_table.state_key)

    return df_airport
Example 6
def create_log_schema():
    """
    Schema structure for log data
    :return: StructType
    """
    log_schema = R([
        Fld('artist', St()),
        Fld('auth', St()),
        Fld('firstName', St()),
        Fld('gender', St()),
        Fld('itemInSession', LInt()),
        Fld('lastName', St()),
        Fld('length', Fl()),
        Fld('level', St()),
        Fld('location', St()),
        Fld('method', St()),
        Fld('page', St()),
        Fld('registration', Dbl()),
        Fld('sessionId', LInt()),
        Fld('song', St()),
        Fld('status', LInt()),
        Fld('ts', LInt()),
        Fld('userAgent', St()),
        Fld('userId', St())
    ])

    return log_schema
Example 7
def get_log_schema():
    """
    Creates a schema for log data.
    
    :return: schema
    """
    log_schema = R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Str()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Str()),
        Fld("song", Str()),
        Fld("status", Str()),
        Fld("ts", Long()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])
    return log_schema
Example 8
def process_song_data(spark, input_data, output_data):
    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'
    
    songSchema = R([
        Fld("artist_id",Str()),
        Fld("artist_latitude",Dbl()),
        Fld("artist_location",Str()),
        Fld("artist_longitude",Dbl()),
        Fld("artist_name",Str()),
        Fld("duration",Dbl()),
        Fld("num_songs",Int()),
        Fld("title",Str()),
        Fld("year",Int()),
    ])
    
    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    song_fields = ["title", "artist_id","year", "duration"]
    
    # extract columns to create songs table
    songs_table = df.select(song_fields).dropDuplicates().withColumn("song_id", monotonically_increasing_id())
    
    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(output_data + 'songs/')
    
    artists_fields = ["artist_id", "artist_name as name", "artist_location as location", "artist_latitude as latitude", "artist_longitude as longitude"]

    # extract columns to create artists table
    artists_table = df.selectExpr(artists_fields).dropDuplicates()
    
    # write artists table to parquet files
    artists_table.write.parquet(output_data + 'artists/')
Example 9
def read_city_demographic_data(spark, filename):
    """ read named city data file into spark dataframe """
    logging.info("Reading demographic data: {}".format(filename))
    demographic_schema = R([
        Fld("City", Str()),
        Fld("State", Str()),
        Fld("Median Age", Ft()),
        Fld("Male Population", Int()),
        Fld("Female Population", Int()),
        Fld("Total Population", Int()),
        Fld("Number of Veterans", Int()),
        Fld("Foreign Born", Int()),
        Fld("Average Household Size", Ft()),
        Fld("State Code", Str()),
        Fld("Race", Str()),
        Fld("Count", Int())
    ])
    try:
        df_demographic = spark.read.options(header=True, delimiter=";").csv(
            filename, demographic_schema)
    except EOFError as ex:
        logging.exception("End of file exception:")
        print(ex)
        raise  # re-raise: df_demographic would otherwise be undefined below
    except FileNotFoundError as ex:
        logging.exception("File not found exception:")
        print(ex)
        raise
    return df_demographic
Example 10
def immigration_etl(source="s3://immigration-us-1/sas_data",
                    destination="s3://immigration-us-1/sas_data_ready_to_use",
                    country_dict="s3://immigration-us-1/raw_data/country_dict.csv",
                    visa_dict="s3://immigration-us-1/raw_data/visa_dict.csv"
                   ):
    # Local udfs
    
    spaceDeleteUDF = udf(lambda s: s.replace(" ", ""), Str())
    ampesandDeleteUDF = udf(lambda s: s.replace("'", ""), Str())
    udf_to_datetime_sas = udf(lambda x: date_converter(x), DateType())
   
    Schema_country = R([
        Fld("id",Str()),
        Fld("country",Str())
        ])
    Schema_Visa = R([
        Fld("id",Str()),
        Fld("Visa_Type",Str())
        ])

    df_spark=spark.read.parquet(source)
    # Only immigrants from air 
    immigrants=df_spark.where(F.col("i94mode")==1)
    immigrants=immigrants.select("cicid", "i94yr", "i94mon", "i94cit", "i94res", "i94port", "arrdate", "i94visa",  "biryear", "gender", "visatype", "airline")
    country_dict=spark.read.csv(country_dict, header=True, mode="DROPMALFORMED", sep="=",schema=Schema_country)
    
    immigrants = immigrants.join(country_dict, immigrants.i94cit == country_dict.id,how='right') 
    immigrants=immigrants.withColumnRenamed("country", "cit_country")
    immigrants=immigrants.drop('id','i94cit')
    
    immigrants = immigrants.join(country_dict, immigrants.i94res == country_dict.id,how='right') 
    immigrants=immigrants.withColumnRenamed("country", "res_country")
    immigrants=immigrants.drop('id','i94res')
    
    visa_dict=spark.read.csv(visa_dict, header=False, mode="DROPMALFORMED",sep="=", schema=Schema_Visa)
    immigrants = immigrants.join(visa_dict, immigrants.i94visa == visa_dict.id,how='right') 
    immigrants=immigrants.drop('id','i94visa')
    
    immigrants=immigrants.withColumn("arrdate", udf_to_datetime_sas("arrdate"))
    
    immigrants = immigrants.withColumn("cicid", immigrants["cicid"].cast(IntegerType()))
    immigrants = immigrants.withColumn("i94yr", immigrants["i94yr"].cast(IntegerType()))
    immigrants = immigrants.withColumn("biryear", immigrants["biryear"].cast(IntegerType()))
    immigrants = immigrants.withColumn("i94mon", immigrants["i94mon"].cast(IntegerType()))
    
    immigrants.show(10) 
    immigrants.write.parquet(destination)
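The date_converter helper used by udf_to_datetime_sas above is not shown in this snippet; a plausible sketch, assuming the usual SAS convention of numeric dates counted as days since 1960-01-01 (this implementation is an assumption, not the original):

from datetime import datetime, timedelta

def date_converter(sas_days):
    # Hypothetical helper: convert a SAS numeric date (days since 1960-01-01) to a date.
    if sas_days is None:
        return None
    return (datetime(1960, 1, 1) + timedelta(days=int(sas_days))).date()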
Example 11
def process_song_data(spark, input_data, output_data):
    """
    Description: This function reads the song_data from S3 into a Spark dataframe,
                 extracts columns from that dataframe to form the "songinf table" and "artist table",
                 and transforms both tables into the format this project needs.
    Parameters: -spark: spark session
                -input_data: location of the song_data json files (in an S3 bucket)
                -output_data: location where the final tables will be saved (in an S3 bucket)
    Return: None
    """
    #--------------------read song data--------------------#
    print('Read song_data...')
    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data_*.json")
    
    # define the song data schema for reading
    SongSchema = R([
                    Fld("artist_id",Str()),
                    Fld("artist_latitude",Doub()),
                    Fld("artist_location",Str()),
                    Fld("artist_longitude",Doub()),
                    Fld("artist_name",Str()),
                    Fld("duration",Doub()),
                    Fld("num_songs",Long()),
                    Fld("song_id",Str()),
                    Fld("title",Str()),
                    Fld("year",Long())
                    ])
    
    # read song data file
    song_df = spark.read.json(song_data, schema=SongSchema)
    
    #--------------------deal with song table--------------------#
    # extract columns to create songinf df
    songinf_df = song_df.select(['song_id', 'title', 'artist_id', 'year', 'duration'])
    songinf_df = songinf_df.dropDuplicates(['song_id'])
    songinf_df = songinf_df.dropna(how = "any", subset = ["song_id"])
    songinf_df = songinf_df.filter(songinf_df.song_id != "")
    
    print('Songs table: ')
    songinf_df.sort('song_id').show(5)
    
    # write songs table to parquet files partitioned by year and artist
    print('Save Songs table into S3...')
    songinf_df.write.partitionBy("year", "artist_id").parquet("{}/song_table.parquet".format(output_data))

    #--------------------deal with artists table--------------------#
    # extract columns to create artists df
    artist_df = song_df.select(['artist_id', 'artist_name', 'artist_location', 'artist_latitude', 'artist_longitude'])
    artist_df = artist_df.dropDuplicates(['artist_id'])
    artist_df = artist_df.dropna(how = "any", subset = ["artist_id"])
    artist_df = artist_df.filter(artist_df.artist_id != "")
    
    print('artists table: ')
    artist_df.sort('artist_id').show(5)
    
    # write artists table to parquet files
    print('Save artists table into S3...')
    artist_df.write.parquet("{}/artist_table.parquet".format(output_data))
Example 12
def set_data_schema(schema):
    """ 
        This function returns the schema definition.
        Param:
            schema: schema name ('Song' or 'Event')
        Output:
            schema
    """
    print("...Setting data schema")
    try:
        songtSchema = R([    
            Fld("artist_id", Str()), 
            Fld("artist_latitude", Str()), 
            Fld("artist_longitude", Str()), 
            Fld("artist_location", Str()),          
            Fld("artist_name", Str()), 
            Fld("song_id", Str()), 
            Fld("title", Str()), 
            Fld("duration", Dbl()),
            Fld("year", Int())
        ]) 

        eventSchema = R([  
            Fld("artists", Str()),
            Fld("auth", Str()),
            Fld("first_name", Str()),
            Fld("gender", Str()),
            Fld("item_in_session", Int()),
            Fld("last_name", Str()),
            Fld("lenght", Str()),
            Fld("level", Str()),
            Fld("location", Str()),
            Fld("method", Str()),
            Fld("page", Str()),
            Fld("registration", Str()),
            Fld("session_id", Int()),
            Fld("song", Str()),
            Fld("status", Int()),
            Fld("ts", Str()),
            Fld("user_agente", Str()),
            Fld("user_id", Int())
        ])
        create_schema = {'Song': songtSchema, 'Event': eventSchema}
        return create_schema[schema]
    except KeyError:
        return "Error: schema does not exist"
Example 13
def process_song_data(spark, input_data, output_data):
    """
    This function processes the song data of sparkify and creates
    facts/dimensions via spark and saves them to our data lake afterwards
    Arguments:
        spark {SparkSession}: Spark session to launch the program
        input_data {str}: location (local/s3) where the (root) input song data resides
        output_data {str}: location (local/s3) where the (root) output files should be written
    """
    # get filepath to song data file
    # song_data = f"{input_data}song_data/A/A/A/*.json"
    song_data = f"{input_data}song_data/*/*/*/*.json"

    # read song data file
    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Str()),
        Fld("artist_longitude", Str()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),
    ])

    # specify the schema up front instead of letting Spark infer it automatically
    df_song = spark.read.json(song_data, schema=songSchema)
    df_song.cache()

    # extract columns to create songs table
    songs_table = df_song.filter(df_song.song_id != '') \
        .select(['song_id',
                 'title',
                 'artist_id',
                 'year',
                 'duration']) \
        .dropDuplicates(['song_id'])

    # write songs table to parquet files partitioned by year and artist
    output_song_data = f"{output_data}song_data/"
    songs_table.write.mode('overwrite').partitionBy(
        "year", "artist_id").parquet(output_song_data)

    # extract columns to create artists table
    artists_table = df_song.filter(df_song.artist_id != '') \
        .selectExpr(['artist_id',
                     'artist_name as name',
                     'artist_location as location',
                     'artist_latitude as latitude',
                     'artist_longitude as longitude']) \
        .dropDuplicates(['artist_id'])

    # write artists table to parquet files
    output_artist_data = f"{output_data}artist_data/"
    artists_table.write.mode('overwrite').parquet(output_artist_data)
Example 14
def process_song_data(spark, input_data, output_data):
    """import Song dataset extract columns and create songs and artist tables
    write those tables to parquet files
    
    Parameters:
    spark: name of spark session
    input_data: location of the source data s3 bucket 
    output_data: location of the destination data s3 bucket
    
    Returns:
    writes songs table in parquet to output_data location + songs
    writes artist_table in parquet to output_data location + artists
    
    """

    # Setting up the JSON table structure for the Song dataset
    song_dataset_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Str()),
    ])
    """get filepath to song data file 
    use "song_data/*/*/*/*.json" for full dataset
    use "song_data/A/B/C/TRABCEI128F424C983.json" to pull a single record

    """
    song_data = input_data + "song_data/*/*/*/*.json"

    # read song data file with dataset_schema
    df = spark.read.json(song_data, schema=song_dataset_schema)

    # extract columns to create songs table
    songs_table = df.select('song_id', 'title', 'artist_id', 'year', 'duration')

    # drop duplicate rows in songs table
    songs_table = songs_table.dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode('append').partitionBy(
        'year', 'artist_id').parquet(output_data + "songs")

    # extract columns to create artists table
    artists_table = df.select('artist_id', 'artist_name', 'artist_location',
                              'artist_latitude', 'artist_longitude')

    # drop duplicate rows in artists table
    artists_table = artists_table.dropDuplicates()

    # write artists table to parquet files
    artists_table.write.mode('append').parquet(output_data + "artists")
Example 15
def get_residence_cities(spark):
    cities = pd.read_csv('residence_city.txt',
                         sep='=',
                         names=['id', 'country'])
    cities['country'] = cities['country'].str.replace("'", '').str.strip()
    cities_data = cities.values.tolist()
    cities_schema = R([Fld('id', Str(), True), Fld('country', Str(), True)])
    cities = spark.createDataFrame(cities_data, cities_schema)
    cities.write.mode('overwrite').parquet('resident_city.parquet')
    return cities
Example 16
def process_song_data(spark, input_data, output_data):
    """
    This function loads the songs JSON dataset from S3, 
    then uses the data to create the songs and artists tables
    
    Input:
    spark = SparkSession object
    input_data = Start of path variable for input files
    output_data = Start of path variable for output files
    
    Output: None
    """

    # get filepath to song data file
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')

    # Define schema
    SongSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])

    # read song data file
    df = spark.read.json(song_data, schema=SongSchema)

    # extract columns to create songs table
    songs_table = df.select("song_id", "title", "artist_id", "year",
                            "duration").dropduplicates()

    # write songs table to parquet files partitioned by year and artist
    output_path = os.path.join(output_data, 'songs_table.parquet')
    songs_table.write.partitionBy("year",
                                  "artist_id").parquet(output_path,
                                                       mode="overwrite")

    # extract columns to create artists table
    artists_table = df.select("artist_id", "artist_name", "artist_location",
                              "artist_latitude",
                              "artist_longitude").dropduplicates()

    # write artists table to parquet files
    output_path = os.path.join(output_data, 'artists_table.parquet')
    artists_table.write.parquet(output_path, mode="overwrite")

    #export whole songs data file to parquet
    output_path = os.path.join(output_data, 'songs_data_table.parquet')
    df.write.parquet(output_path, mode="overwrite")
Example 17
def process_song_data(spark, input_data, output_data):
    '''
    load song data in json format from S3 bucket and process these data by extracting 
    songs table and artists table, and save these tables back to S3 bucket
    
    :param spark: spark session
    :param input_data: data location for input data
    :param output_data: data location for output data
    :return: no return value
    '''
    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    # create songs schema
    songSchema = R([
        Fld('artist_id', Str()),
        Fld('artist_latitude', Dbl()),
        Fld('artist_location', Str()),
        Fld('artist_longitude', Dbl()),
        Fld('artist_name', Str()),
        Fld('duration', Dbl()),
        Fld('num_songs', Int()),
        Fld('title', Str()),
        Fld('year', Int()),
    ])

    # load songs json files from S3
    df_songs = spark.read.json(song_data, schema=songSchema)

    # select columns for songs_table
    songs_attr = ['title', 'artist_id', 'year', 'duration']
    songs_table = df_songs.select(songs_attr)\
    .dropDuplicates()\
    .withColumn('song_id', monotonically_increasing_id())

    # write songs_table to S3
    songs_table.write.partitionBy('year',
                                  'artist_id').parquet(output_data + 'songs/')

    # select artists columns
    artists_attr = [
        'artist_id', 'artist_name', 'artist_location', 'artist_latitude',
        'artist_longitude'
    ]
    artists_table = df_songs.select(artists_attr)\
    .dropDuplicates()

    artists_table = artists_table\
    .withColumnRenamed('artist_name','name')\
    .withColumnRenamed('artist_location','location')\
    .withColumnRenamed('artist_latitude','latitude')\
    .withColumnRenamed('artist_longitude','longitude')

    # write artists_table to S3
    artists_table.write.parquet(output_data + 'artists/')
Example 18
def process_song_data(spark, input_data, output_data):
    """Process song data, transform the data into songs and artists tables
    and store it in parquet files on S3.

    Parameters
    ----------
    spark : SparkSession
        cursor to the sparkify database connection
    input_data : string
        input data prepend path
    output_data : string
        output data prepend path
    """
    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")

    song_schema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str(), False),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str(), False),
        Fld("song_id", Str(), False),
        Fld("title", Str(), False),
        Fld("duration", Dbl(), False),
        Fld("year", Int())
    ])

    # read song data file
    df = spark.read.json(song_data, song_schema)

    # extract columns to create songs table
    songs_table = df.select(
        ["song_id", "title", "artist_id", "year", "duration"])

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode("overwrite").partitionBy('year', 'artist_id') \
        .parquet(os.path.join(output_data, 'analytics/songs'))

    # extract columns to create artists table
    artists_table = df.select([
        "artist_id", "artist_name", "artist_location", "artist_latitude",
        "artist_longitude"
    ])
    artists_table = artists_table.withColumnRenamed("artist_name", "name") \
        .withColumnRenamed("artist_location", "location") \
        .withColumnRenamed("artist_latitude", "latitude") \
        .withColumnRenamed("artist_longitude", "longitude")

    # write artists table to parquet files
    artists_table.write.mode("overwrite") \
        .parquet(os.path.join(output_data, 'analytics/artists'))
Example 19
def process_song_data(spark, input_data, output_data):
    """
    Process the songs data from S3 storage and create the analytical tables, songs table and artists table.
    
    This function reads the data from JSON files in S3 storage, transforms the data into the analytical tables
    (songs and artists), and writes them into partitioned parquet files on S3.
    
    Args:
        spark: the spark session
        input_data: the S3 bucket to read data from
        output_data: the S3 bucket to write analytics tables to
    """
    # get filepath to song data file
    song_data = input_data + "song_data/*/*/*/*.json"

    # defined the song data schema
    song_data_schema = R([
        Fld("artist_id", Str(), False),
        Fld("artist_latitude", Str(), True),
        Fld("artist_longitude", Str(), True),
        Fld("artist_location", Str(), True),
        Fld("artist_name", Str(), False),
        Fld("song_id", Str(), False),
        Fld("title", Str(), False),
        Fld("duration", Dbl(), False),
        Fld("year", Int(), False)
    ])

    # read song data file
    df = spark.read.json(song_data, schema=song_data_schema)

    # extract columns to create songs table
    songs_table = df.select("song_id", "title", "artist_id", "year",
                            "duration").distinct()

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.parquet(output_data + "songs_table.parquet",
                              mode="overwrite",
                              partitionBy=["year", "artist_id"])

    # extract columns to create artists table
    artists_table = df.select(
        "artist_id",
        col("artist_name").alias("name"),
        col("artist_location").alias("location"),
        col("artist_latitude").alias("latitude"),
        col("artist_longitude").alias("longitude"),
    ).distinct()

    # write artists table to parquet files
    artists_table.write.parquet(output_data + "artists_table.parquet",
                                mode="overwrite")
Example 20
def process_song_data(spark, input_data, output_data):
    """
    The function loads data from song_data dataset and extract columns
    for songs and artist tables and write the data into parquet
    files which will be loaded on s3.
    
    """
    song_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long())
    ])

    # get filepath to song data file
    song_data = 'song_data/*/*/*/*.json'

    # read song data file
    df = spark.read.json(os.path.join(input_data, song_data),
                         schema=song_schema)

    # extract columns to create songs table
    songs_table = df.select('song_id', 'title', 'artist_id', 'year',
                            'duration').dropDuplicates()

    songs_table.createOrReplaceTempView('songs')

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy('year', 'artist_id').parquet(
        os.path.join(output_data, 'songs/songs.parquet'), 'overwrite')

    # extract columns to create artists table
    artists_table = df.select('artist_id', 'artist_name', 'artist_location',
                              'artist_latitude', 'artist_longitude') \
                      .withColumnRenamed('artist_name', 'name') \
                      .withColumnRenamed('artist_location', 'location') \
                      .withColumnRenamed('artist_latitude', 'latitude') \
                      .withColumnRenamed('artist_longitude', 'longitude') \
                      .dropDuplicates()

    artists_table.createOrReplaceTempView('artists')

    # write artists table to parquet files
    artists_table.write.parquet(
        os.path.join(output_data, 'artists/artists.parquet'), 'overwrite')
Example 21
def process_song_data(spark, input_data, output_data):
    """
    Method to process song data and create tables: songs, artists
    :param spark: Spark session
    :param input_data: S3 bucket
    :param output_data: S3 bucket
    :return: Data frame of song data
    """
    # get filepath to song data file
    song_data = input_data + '/song-data/A/A/B/*.json'

    songs_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int())
    ])

    # read song data file
    print('Reading song data.')
    df = spark.read.json(song_data, schema=songs_schema)

    song_columns = ['song_id', 'title', 'artist_id', 'year', 'duration']

    # extract columns to create songs table
    songs_table = df.selectExpr(song_columns).dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    print('Writing songs to parquet.')
    write_parquet(songs_table, output_data, 'songs', 'year', 'artist_id')

    artist_columns = [
        'artist_id', 'artist_name as name', 'artist_location as location',
        'artist_latitude as latitude', 'artist_longitude as longitude'
    ]

    # extract columns to create artists table
    artists_table = df.selectExpr(artist_columns).dropDuplicates()

    # write artists table to parquet files
    print('Writing artists to parquet.')
    write_parquet(artists_table, output_data, 'artists', None, None)

    return df
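The write_parquet helper called above is not defined in this snippet; a minimal sketch of what it plausibly does, inferred from the two call sites (name, signature, and output layout are assumptions):

def write_parquet(df, output_data, table_name, partition_col1=None, partition_col2=None):
    # Hypothetical helper: optionally partition by the given columns, then write to output_data + table_name.
    writer = df.write.mode('overwrite')
    partition_cols = [c for c in (partition_col1, partition_col2) if c]
    if partition_cols:
        writer = writer.partitionBy(*partition_cols)
    writer.parquet(output_data + table_name)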
Example 22
def process_song_data(spark, input_data, output_data):
    """
    Loads the song_data from AWS S3 (input_data) and extracts the songs and artist tables
    and then loads the processed data back to S3 (output_data)
    
    :param spark: Spark Session object
    :param input_data: Location (AWS S3 path) of songs metadata (song_data) JSON files
    :param output_data: Location (AWS S3 path) where dimensional tables will be stored in parquet format 
    """

    # Get filepath to song data file
    song_data = input_data + "song_data/*/*/*/*.json"

    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # Read song data file
    print("Reading song_data JSON files from S3")
    df = spark.read.json(song_data, mode='PERMISSIVE', schema=songSchema, \
                         columnNameOfCorruptRecord='corrupt_record').dropDuplicates()
    print("Read completed")

    # Extract columns to create songs table
    songs_table = df.select("title", "artist_id", "year", "duration").dropDuplicates() \
                    .withColumn("song_id", monotonically_increasing_id())

    print("Writing Songs table to S3 after processing")
    # Write songs table to parquet files partitioned by year and artist
    songs_table.write.parquet(output_data + "songs/",
                              mode="overwrite",
                              partitionBy=["year", "artist_id"])
    print("Completed")

    # Extract columns to create artists table
    artists_table = df.select("artist_id", "artist_name", "artist_location", "artist_latitude", "artist_longitude") \
                        .dropDuplicates()

    print("Writing Artists table to S3 after processing")
    # Write artists table to parquet files
    artists_table.write.parquet(output_data + "artists/", mode="overwrite")
    print("Completed")
Example 23
def process_song_data(spark, input_data, output_data):
    """
    Read song data from S3, create the songs_table and artists_table, and load them back to S3.
    
    parameters:
    spark: spark session
    input_data: path of song data
    output_data: path of output table
    
    """
    # get filepath to song data file
    # song_data = input_data + "song_data/*/*/*/*.json"
    song_data = input_data + "song_data/A/B/C/TRABCEI128F424C983.json"

    # create song table schema
    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # extract columns to create songs table, drop if year and artist_id are missing and year should not equal to 0
    song_field = ["title", "duration", "year", "artist_id"]
    songs_table = df.select(song_field).dropDuplicates().withColumn("song_id",F.monotonically_increasing_id())\
    .filter(~col("year").isin([0]) & col("year").isNotNull() & col("artist_id").isNotNull())

    # extract columns to create artists table, drop if artist_id and name containing any null values
    artist_field = [
        "artist_id", "artist_name", "artist_location", "artist_latitude",
        "artist_longitude"
    ]
    artists_table = df.select(artist_field).dropDuplicates().dropna(
        subset=["artist_id", "artist_name"])

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year",
                                  "artist_id").parquet(output_data + "songs/",
                                                       mode="overwrite")

    # write artists table to parquet files
    artists_table.write.parquet(output_data + "artists/", mode="overwrite")
Example 24
def process_song_data(spark, input_data, output_data):
    # get filepath to song data file
    song_data = input_data + "./data/song_data/*/*/*/*.json"
    """Creating the song_data file schema that we are going to add to spark"""
    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # reading song data file json structure
    df = spark.read.json(song_data, schema=songSchema)
    """Filtering out only the needed columns for the songs table"""
    song_fields = ["title", "artist_id", "year", "duration"]

    print('Creating the songs table and dropping duplicates')
    songs_table = df.select(song_fields).dropDuplicates().withColumn(
        "song_id", monotonically_increasing_id())
    print(
        "--- All duplicate songs have been dropped and the songs table created ---"
    )
    print('Printing some rows from the songs_table')
    songs_table.show(15)
    print('Saving the songs table to the s3 bucket')
    songs_table.write.partitionBy('year',
                                  'artist_id').parquet(output_data + "songs")
    print("--- songs.parquet completed ---")
    """Filtering out only the needed columns for the artists table"""
    artists_data = [
        'artist_id', 'artist_name', 'artist_location', 'artist_latitude',
        'artist_longitude'
    ]

    print("--- Starting to drop duplicate artists....")
    artists_table = df.selectExpr(artists_data).dropDuplicates()
    print("All duplicate artists have been dropped......")

    print('Printing some rows from the artists_table')
    artists_table.show(15)
    """writing the artists table to the parquets file"""
    artists_table.write.parquet(output_data + "artists")
    print("--- artists.parquet completed ---")
    print("*** process_song_data completed ***\n\n")
Example 25
def process_song_data(spark, input_data, output_data):
    """
        Description: This function fetches song_data from S3 into a staging dataframe, 
        then extracts the songs and artist tables,
        and eventually exports data back to S3
        
        Parameters:
            spark       : object for Spark Session
            input_data  : location of song_data 
            output_data : location of target S3 bucket
            
    """

    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'
    
    # define schema
    songdata_schema = R([
    Fld("artist_id",Str()),
    Fld("artist_latitude",Dbl()),
    Fld("artist_location",Str()),
    Fld("artist_longitude",Dbl()),
    Fld("artist_name",Str()),
    Fld("duration",Dbl()),
    Fld("num_songs",Int()),
    Fld("title",Str()),
    Fld("year",Int()),
    ])
    
    # read song data file
    df = spark.read.json(song_data, schema=songdata_schema)

    # extract columns to create songs table
    songs_table = df.select(['title', 'artist_id', 'year', 'duration'])

    songs_table = songs_table.dropDuplicates().withColumn('song_id', monotonically_increasing_id()).\
    select(['song_id', 'title', 'artist_id', 'year', 'duration'])
    
    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy('year', 'artist_id').parquet(output_data + 'songs/')

    # extract columns to create artists table
    selection = ['artist_id', 'artist_name as name', \
                 'artist_location as location', 'artist_latitude as latitude', \
                 'artist_longitude as longitude']
    artists_table = df.selectExpr(selection).dropDuplicates()
    
    # write artists table to parquet files
    # artists_table has no 'year' column, so write it without partitioning
    artists_table.write.parquet(output_data + 'artists/')
Example 26
def process_song_data(spark, input_data, output_data):
    """
    process_song_data - Loads the song data files from S3, and saves the song information to a parquet file
    (parititioned by year and artist_id), and then extracts the distinct artists and saves them to a parquet file.
    """

    # Get filepath to song data file
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')
    #    song_data = os.path.join(input_data,'song_data/A/A/A/TRAAAAK128F9318786.json')

    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])

    # Read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # Extract columns to create songs table
    songs_table = df.select(
        ['song_id', 'title', 'artist_id', 'year', 'duration'])

    # Write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year",
                                  "artist_id").mode('overwrite').parquet(
                                      os.path.join(output_data,
                                                   'songs.parquet'))

    # Extract columns to create artists table, and find the distinct artists
    artists_table = df.select([
        'artist_id', 'artist_name', 'artist_location', 'artist_latitude',
        'artist_longitude'
    ]).withColumnRenamed('artist_name', 'name').withColumnRenamed(
        'artist_location', 'location').withColumnRenamed(
            'artist_latitude',
            'latitude').withColumnRenamed('artist_longitude',
                                          'longitude').distinct()

    # Write artists table to parquet files
    artists_table.write.mode('overwrite').parquet(
        os.path.join(output_data, 'artists.parquet'))
Example 27
def process_song_data(spark, input_data, output_data):
    """
    Loads song_data from S3, extracting the needed columns for 'song_table' and 'artist_table'
    and writing them in parquet format to S3.

    Parameters:
        spark       : Spark Session
        input_data  : Location of the song_data json files with the songs metadata
        output_data : S3 bucket where the tables are stored in parquet format
    """

    # get filepath to song data file
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')

    # Define the right types for the input json structure
    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # extract columns to create songs table
    songs_table = df.selectExpr(
        "song_id", "title", "artist_id", "year",
        "duration").orderBy("song_id").drop_duplicates()

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy('year', 'artist_id').parquet(
        os.path.join(output_data, 'songs'))

    # extract columns to create artists table
    artists_table = df.selectExpr("artist_id", "artist_name as name",
                                  "artist_location as location",
                                  "artist_latitude as latitude",
                                  "artist_longitude as longitude").orderBy(
                                      "artist_id").drop_duplicates()

    # write artists table to parquet files
    artists_table.write.parquet(os.path.join(output_data, 'artists'))
Example 28
def process_song_data(spark, input_data_songs, output_data):
    """
    Read song data by providing it an expected schema.
    Create songs and artists tables.
    """
    # define song data schema to improve performance
    song_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long())
    ])

    song_data = input_data_songs

    t_start = time()
    dfSongs = spark.read.json(song_data, schema=song_schema)
    t_end = time() - t_start
    print('Read song data in {} secs'.format(t_end))
    dfSongs.printSchema()

    dfSongs.count()
    dfSongs.show(5)

    songs_table = dfSongs.filter(dfSongs.song_id != '')\
                     .select(['song_id', 'title', 'artist_id', 'year', 'duration'])
    songs_table.show(5)
    songs_table.write.partitionBy(
        "year",
        "artist_id").mode('overwrite').parquet(output_data +
                                               'songs/songs_table.parquet')

    artists_table = dfSongs.filter(dfSongs.artist_id !='') \
                        .select(col("artist_id"),col("artist_name").alias("name"), col("artist_location").alias("location"),
                                 col("artist_longitude").alias("longitude"), col("artist_latitude").alias("latitude"))\
                        .dropDuplicates()

    artists_table.show(5)

    artists_table.write.mode('overwrite').parquet(
        output_data + 'artists/artists_table.parquet')
Example 29
def get_song_src_schema():
    """
    Get the source spark schema definition
    :return: The schema definition
    """
    return R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])
Example 30
def process_song_data(spark, input_data, output_data):
    """
    Read song data files in JSON-format from Amazon S3,
    load the processed data into two analytical tables,
    and write these tables as parquet files back to Amazon S3.
    """
    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data", *3*["*"], "*.json")
    #song_data = os.path.join("data", "song_data", *3*["*"], "*.json")
    
    # create song data schema
    song_data_schema = R([
        Fld("artist_id", Str(), False),
        Fld("artist_latitude", Str(), True),
        Fld("artist_longitude", Str(), True),
        Fld("artist_location", Str(), True),
        Fld("artist_name", Str(), False),
        Fld("song_id", Str(), False),
        Fld("title", Str(), False),
        Fld("duration", Dbl(), False),
        Fld("year", Int(), False)
    ])
    
    # read song data file
    df = spark.read.json(path=song_data, schema=song_data_schema)

    # extract columns to create songs table
    songs_table = df.select("song_id", "title", "artist_id", "year", "duration").distinct()
    
    # write songs table to parquet files partitioned by year and artist
    songs_table.write.parquet(
        output_data + "songs_table.parquet",
        mode="overwrite",
        partitionBy=["year", "artist_id"]
    )

    # extract columns to create artists table
    artists_table = df.select("artist_id",
                              col("artist_name").alias("name"),
                              col("artist_location").alias("location"),
                              col("artist_latitude").alias("latitude"),
                              col("artist_longitude").alias("longitude")
                             ).distinct()

    # write artists table to parquet files
    artists_table.write.parquet(output_data + "artists_table.parquet", mode="overwrite")