Example #1
from pyspark.sql.types import (StructType, StructField, StringType as Str,
                               IntegerType as Int, DoubleType as Dbl,
                               DecimalType as Dec, LongType as Long)


def create_log_data():
    """
    Create the schema for the log data.

    :return: log data schema
    """
    log_schema = StructType([
        StructField("artist", Str()), 
        StructField('auth', Str()),
        StructField('firstName', Str()),
        StructField('gender', Str()),
        StructField('itemInSession', Int()),
        StructField('lastName', Str()),
        StructField('length', Dbl()),
        StructField('level', Str()),
        StructField('location', Str()),
        StructField('method', Str()),
        StructField('page', Str()),
        StructField('registration', Dec()),
        StructField('sessionId', Int()),
        StructField('song', Str()),
        StructField('status', Int()),
        StructField('ts', Long()),
        StructField('userAgent', Str()),
        StructField('userId', Int())
    ])
    return log_schema
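
A minimal usage sketch for the schema above (the SparkSession setup and the log_data/*.json path are my own illustrative assumptions, not part of the original example):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("log-schema-demo").getOrCreate()
# Read the raw JSON events with the explicit schema instead of letting Spark infer it.
log_df = spark.read.json("log_data/*.json", schema=create_log_data())
log_df.printSchema()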
Example #2
from pyspark.sql.types import (StructType as R, StructField as Fld, StringType as Str,
                               IntegerType as Int, DoubleType as Dbl, DecimalType as Dec)


def get_song_schema():
    """
    Creates a schema for song data.
    
    :return: schema
    """
    song_schema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dec()),
        Fld("artist_longitude", Dec()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])
    return song_schema
Example #3
from pyspark.sql.types import (StructType, StructField, StringType as Str,
                               IntegerType as Int, DoubleType as Dbl, DecimalType as Dec)


def create_song_schema():
    """
    Create the schema for the song data.

    :return: song data schema
    """
    song_schema = StructType([
        StructField("num_songs", Int()),
        StructField("artist_id", Str()),
        StructField("artist_latitude", Dec()),
        StructField("artist_longitude", Dec()),
        StructField("artist_location", Str()),
        StructField("artist_name", Str()),
        StructField("song_id", Str()),
        StructField("title", Str()),
        StructField("duration", Dbl()),
        StructField("year", Int())
    ])
    return song_schema
Example #4
from pyspark.sql.types import (StructType as R, StructField as Fld, StringType as Str,
                               IntegerType as Int, DoubleType as Dbl, DecimalType as Dec)


def get_song_schema():
    """
    Description: Creates a schema for song data.
    Arguments: None
    Returns: song dataset schema

    """
    song_schema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dec()),
        Fld("artist_longitude", Dec()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),
    ])
    return song_schema
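
As with the log schema, a short usage sketch for the song schema (assuming an existing SparkSession named spark and a hypothetical song_data path that follows the layout used in Example #5):

song_df = spark.read.json("song_data/*/*/*/*.json", schema=get_song_schema())
# A quick sanity check on a few typed columns.
song_df.select("song_id", "title", "artist_name", "year").show(5)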
Example #5
from pyspark.sql.types import (StructType as R, StructField, StringType as Str,
                               IntegerType as Int, DoubleType as Dbl, DecimalType as Dec)


def process_song_data(spark, input_data, output_data, aws=True):
    """
    Process the song data by reading the .json files under the song_data
    folder inside input_data.
    Creates the songs and artists tables.
    If aws is True, writes the tables to the S3 bucket given in output_data.

    Args:
        - spark: the Spark session
        - input_data: path to the folder containing song_data
        - output_data: path to the S3 bucket where the tables are written
        - aws: set to True when the script is executed on the cluster;
            set to False when executing locally for debugging
    """
    
    # Define the song schema before importing data
    songsSchema = R([
        StructField("num_songs", Int()),
        StructField("artist_id", Str()),
        StructField("artist_latitude", Dec()),
        StructField("artist_longitude", Dec()),
        StructField("artist_location", Str()),
        StructField("artist_name", Str()),
        StructField("song_id", Str()),
        StructField("title", Str()),
        StructField("duration", Dbl()),
        StructField("year", Int()),
    ])

    # get filepath to song data file
    song_data = input_data + "song_data/*/*/*/*.json"
    
    # read song data file
    print("Reading song_data from {}\n".format(song_data))
    df = spark.read.json(song_data,schema=songsSchema)
    
    # extract columns to create songs table
    print("Extracting columns to create the songs table...\n")
    df.createOrReplaceTempView("songs_data_table")
    songs_table = spark.sql('''
        SELECT DISTINCT song_id, title, artist_id, year, duration
        FROM songs_data_table
    ''')
    print("done.")
    
    
    # print song table schema
    print("Songs table schema:\n")
    songs_table.printSchema()
    
    # write songs table to parquet files partitioned by year and artist
    if aws:
        print("Writing the songs table to parquet files partitioned by year and artist...\n")
        songs_table.write.parquet(output_data + "songs_table.parquet",
                                 partitionBy = ["year", "artist_id"],
                                 mode = "overwrite")
        print("done.")

    # extract columns to create artists table
    print("Extracting columns to create the artists table...\n")
    artists_table = spark.sql('''
        SELECT DISTINCT artist_id, artist_name AS name, artist_location AS location,
               artist_latitude AS latitude, artist_longitude AS longitude
        FROM songs_data_table
    ''')
    print("done.")
    
    # print artists table schema
    print("Artists table schema:\n")
    artists_table.printSchema()

    # write artists table to parquet files
    if aws:
        print("Writing the artists table to parquet files ...\n")
        artists_table.write.parquet(output_data + "artists_table.parquet",
                                  mode = "overwrite")
        print("done.")
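
A possible invocation of process_song_data (the paths and bucket name are hypothetical; aws=False mirrors the local-debugging mode described in the docstring):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("sparkify-etl").getOrCreate()
# Dry run against local sample data; on the cluster use aws=True and an s3a:// output path.
process_song_data(spark, input_data="data/", output_data="s3a://my-output-bucket/", aws=False)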
Example #6
from datetime import datetime

from pyspark.sql.functions import udf, monotonically_increasing_id
from pyspark.sql.types import (StructType as R, StructField, StringType as Str,
                               IntegerType as Int, DoubleType as Dbl, DecimalType as Dec,
                               DateType as Date, TimestampType)


def process_log_data(spark, input_data, output_data, aws=True):
    """
    Process the log and song data by reading
    - the .json files in the log_data folder,
    - the .json files in the song_data folder.

    Creates the users, time and songplays tables.
    If aws is True, writes the tables to the S3 bucket given in output_data.

    Args:
        - spark: the Spark session
        - input_data: path to the folder containing log_data and song_data
        - output_data: path to the S3 bucket where the tables are written
        - aws: set to True when the script is executed on the cluster;
            set to False when executing locally for debugging
    """
    
    songsSchema = R([
        StructField("num_songs",Int()),
        StructField("artist_id",Str()),
        StructField("artist_latitude",Dec()),
        StructField("artist_longitude",Dec()),
        StructField("artist_location",Str()),
        StructField("artist_name",Str()),
        StructField("song_id",Str()),
        StructField("title",Str()),
        StructField("duration",Dbl()),
        StructField("year",Int()),
    ])
    
    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"
    
    # read log data file
    print("Reading log_data from {}\n".format(log_data))
    df = spark.read.json(log_data)
    print("done.")
    
    # filter by actions for song plays
    print("Filter by actions for song plays...")
    df = df.filter(df.page=='NextSong')
    df.createOrReplaceTempView("logs_data_table")
    print("done.")
    
    # extract columns for users table   
    print("Extract columns for users table...")
    users_table = spark.sql('''
        SELECT DISTINCT userId AS user_id, firstName AS first_name, lastName AS last_name,
               gender, level
        FROM logs_data_table
    ''')
    users_table = users_table.dropDuplicates(["user_id"])
    print("done.")

    
    # write users table to parquet files
    if aws: 
        print("Write users table to parquet files...")
        users_table.write.parquet(output_data + "users_table.parquet",
                                 mode = "overwrite") 
        print("done.")

    # create datetime column from original timestamp column 
    print("Create datetime column from original timestamp column...")
    get_datetime = udf(lambda time: datetime.fromtimestamp((time/1000.0)), Date())
    df = df.withColumn("date",get_datetime("ts")) 
    print("done.")
    
    
    # create timestamp column from original timestamp column 
    print("Create timestamp column from original timestamp column...")
    convert_ts = udf(lambda time: datetime.fromtimestamp((time/1000.0)), TimestampType())
    df = df.withColumn("ts",convert_ts("ts")) 
    print("done.")

    # extract columns to create time table 
    print("Extract columns to create time table...")
    df.createOrReplaceTempView("clean")
    time_table = spark.sql('''
        SELECT ts AS start_time, 
            date_format(date, 'yyyy') AS year,
            date_format(date,'MM') AS month,
            date_format(date,'dd') AS day,
            date_format(date,'w') AS week,
            date_format(ts,'E') AS weekday,
            HOUR(ts) AS hour
        FROM clean
    ''').dropDuplicates(["start_time"])
    print("done.")
    
    # write time table to parquet files partitioned by year and month
    if aws: 
        print("Write time table to parquet files partitioned by year and month...")
        time_table.write.parquet(output_data + "time_table.parquet",
                                 partitionBy = ["year", "month"],
                                 mode = "overwrite") 
        print("done.")

    # read in song data to use for songplays table 
    print("Read in song data to use for songplays table...")
    song_data = input_data + "song_data/*/*/*/*.json"
    song_df = spark.read.json(song_data, schema=songsSchema)
    song_df.createOrReplaceTempView("songs_data_table") 
    print("done.")

    # extract columns from joined song and log datasets to create songplays table  
    print("Extract columns from joined song and log datasets to create songplays table...")
    artists_table = spark.sql('''
        SELECT DISTINCT artist_id, artist_name AS name, artist_location AS location,
               artist_latitude AS latitude, artist_longitude AS longitude
        FROM songs_data_table
    ''')
    artists_table.createOrReplaceTempView("artists") 
    print("done.")
    
     
    print("Extract columns to create songplays table...")
    songplays_table = spark.sql('''
        SELECT 
            year(l.ts) AS year,
            month(l.ts) AS month,
            l.ts AS start_time,
            l.userId AS user_id,
            l.level,
            s.song_id,
            a.artist_id,
            l.sessionId AS session_id,
            l.location,
            l.userAgent AS user_agent
        FROM clean AS l
        JOIN songs_data_table AS s 
        ON (l.song = s.title AND l.artist = s.artist_name)  
        JOIN artists AS a ON a.artist_id=s.artist_id
    ''')
    print("done.")
    
    print("Create songplays_id...")
    songplays_table = songplays_table.withColumn("songplay_id", monotonically_increasing_id())
    print("done.")

    # write songplays table to parquet files partitioned by year and month
    if aws: 
        print("Write songplays table to parquet files partitioned by year and month...")
        songplays_table.write.parquet(output_data + "songplays_table.parquet",
                             partitionBy = ["year", "month"],
                             mode = "overwrite")
        print("done.")
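
As a side note, the two Python UDF conversions above can also be expressed with built-in column functions; this is an alternative sketch (df stands for the log dataframe inside the function), not the approach used in the example:

from pyspark.sql.functions import col, to_date

# Casting the epoch value (divided down to seconds) to timestamp avoids the Python UDF round-trip;
# the calendar date is then derived from the converted timestamp.
df = df.withColumn("ts", (col("ts") / 1000).cast("timestamp"))
df = df.withColumn("date", to_date(col("ts")))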
Example #7
from pyspark.sql.functions import udf, upper, trim, concat, col, lit
from pyspark.sql.types import (LongType as Long, DecimalType as Dec, StructType as R,
                               StructField as Fld, DoubleType as Dbl, StringType as Str,
                               IntegerType as Int, DateType as Date)
songSchema = R([
    Fld("artist_id",Str()),
    Fld("artist_latitude",Dec(10,5)),
    Fld("artist_location",Str()),
    Fld("artist_longitude", Dec(10,5)),
    Fld("artist_name",Str()),
    Fld("duration",Dec(10,5)),
    Fld("num_songs",Int()),
    Fld("song_id", Str()),
    Fld("title",Str()),
    Fld("year",Int()),
])

logSchema = R([
    Fld("artist",Str()),
    Fld("auth", Str()),
    Fld("firstName",Str()),
    Fld("gender", Str()),
    Fld("itemInSession",Int()),
    Fld("lastName",Str()),
    Fld("length",Dec(10,5)),
    Fld("level", Str()),
    Fld("location",Str()),
    Fld("method",Str()),
    Fld("page",Str()),
    Fld("registration", Dbl()),
    Fld("sessionId",Int()),
    Fld("song", Str()),
    Fld("status",Str()),
    Fld("ts",Long()),
    Fld("userAgent",Str()),
    Fld("userId",Int()),
])


def demographics_to_parquet(spark, src, dest, joins):
    """
    Build the demographics table from the raw city demographics CSV in src,
    join it to the ports table given in joins[0], and write the result to
    dest as parquet.
    """
    @udf('string')
    def gen_demo_id(race):
        if race == "Black or African-American":
            return "BAA"
        elif race == "Hispanic or Latino":
            return "HL"
        elif race == "White":
            return "W"
        elif race == "Asian":
            return "A"
        elif race == "American Indian and Alaska Native":
            return "AI"
        else:
            return "O"

    cSchema = R([
        Fld("demographics_id", Str(), False),
        Fld("port_id", Str(), False),
        Fld("city", Str(), True),
        Fld("state", Str(), True),
        Fld("median_age", Dec(4, 1), True),
        Fld("male_population", Int(), True),
        Fld("female_population", Int(), True),
        Fld("total_population", Int(), True),
        Fld("avg_household_size", Dec(3, 2), True),
        Fld("foreign_born", Int(), True),
        Fld("race", Str(), True),
        Fld("race_code", Str(), True)
    ])

    demographics_df = spark.read.csv(src, sep=";", header=True)

    port_df = spark.read.parquet(joins[0])

    demographics_df = demographics_df.withColumn('race_code',
                                                 gen_demo_id("Race"))

    joined_df = demographics_df.join(port_df, [
        upper(demographics_df['City']) == upper(port_df['city']),
        trim(demographics_df['State Code']) == trim(port_df['state'])
    ])

    joined_df = joined_df.select(demographics_df["*"], port_df['port_id'])

    joined_df = joined_df.withColumn(
        'demographics_id',
        concat(col("port_id"), lit("-"), col("race_code")))

    joined_df = joined_df.selectExpr(
        "demographics_id", "port_id", "city", "state",
        "`Median Age` as median_age",
        "`Male Population` as male_population",
        "`Female Population` as female_population",
        "`Total Population` as total_population",
        "`Average Household Size` as avg_household_size",
        "`Foreign-born` as foreign_born", "`Race` as race", "race_code")

    # apply typing for easy copy into Redshift later
    joined_df = joined_df.withColumn("demographics_id",
                                     col("demographics_id").cast(Str()))
    joined_df = joined_df.withColumn("port_id", col("port_id").cast(Str()))
    joined_df = joined_df.withColumn("city", col("city").cast(Str()))
    joined_df = joined_df.withColumn("state", col("state").cast(Str()))
    joined_df = joined_df.withColumn("median_age",
                                     col("median_age").cast(Dec(4, 1)))
    joined_df = joined_df.withColumn("male_population",
                                     col("male_population").cast(Int()))
    joined_df = joined_df.withColumn("female_population",
                                     col("female_population").cast(Int()))
    joined_df = joined_df.withColumn("total_population",
                                     col("total_population").cast(Int()))
    joined_df = joined_df.withColumn(
        "avg_household_size",
        col("avg_household_size").cast(Dec(3, 2)))
    joined_df = joined_df.withColumn("foreign_born",
                                     col("foreign_born").cast(Int()))
    joined_df = joined_df.withColumn("race", col("race").cast(Str()))
    joined_df = joined_df.withColumn("race_code",
                                     col("race_code").cast(Str()))

    # fill nulls with 0
    joined_df = joined_df.fillna(0)
    joined_df.write.mode('overwrite').parquet(dest)
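
An invocation sketch for demographics_to_parquet (the SparkSession setup, file paths and the contents of the joins list are made up for illustration):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("demographics-etl").getOrCreate()
# joins[0] is expected to point at an already-written ports parquet dataset.
demographics_to_parquet(spark,
                        src="us-cities-demographics.csv",
                        dest="output/demographics.parquet",
                        joins=["output/ports.parquet"])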