def process_song_data(spark, input_data, output_data):
    """
    Description: Read the song_data JSON files from S3 into a Spark dataframe, extract columns
                 to build the songs table and the artists table, and transform them into the
                 format this project needs.
    Parameters: -spark: spark session
                -input_data: location of the song_data json files (in S3 bucket)
                -output_data: location where the final tables will be saved (in S3 bucket)
    Return: None
    """
    #--------------------read song data--------------------#
    print('Read song_data...')
    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data_*.json")
    
    # define the song data schema for reading
    SongSchema = R([
                    Fld("artist_id",Str()),
                    Fld("artist_latitude",Doub()),
                    Fld("artist_location",Str()),
                    Fld("artist_longitude",Doub()),
                    Fld("artist_name",Str()),
                    Fld("duration",Doub()),
                    Fld("num_songs",Long()),
                    Fld("song_id",Str()),
                    Fld("title",Str()),
                    Fld("year",Long())
                    ])
    
    # read song data file
    song_df = spark.read.json(song_data, schema=SongSchema)
    
    #--------------------deal with song table--------------------#
    # extract columns to create songinf df
    songinf_df = song_df.select(['song_id', 'title', 'artist_id', 'year', 'duration'])
    songinf_df = songinf_df.dropDuplicates(['song_id'])
    songinf_df = songinf_df.dropna(how = "any", subset = ["song_id"])
    songinf_df = songinf_df.filter(songinf_df.song_id != "")
    
    print('Songs table: ')
    songinf_df.sort('song_id').show(5)
    
    # write songs table to parquet files partitioned by year and artist
    print('Save Songs table into S3...')
    songinf_df.write.partitionBy("year", "artist_id").parquet("{}/song_table.parquet".format(output_data))

    #--------------------deal with artists table--------------------#
    # extract columns to create artists df
    artist_df = song_df.select(['artist_id', 'artist_name', 'artist_location', 'artist_latitude', 'artist_longitude'])
    artist_df = artist_df.dropDuplicates(['artist_id'])
    artist_df = artist_df.dropna(how = "any", subset = ["artist_id"])
    artist_df = artist_df.filter(artist_df.artist_id != "")
    
    print('Artists table: ')
    artist_df.sort('artist_id').show(5)
    
    # write artists table to parquet files
    print('Save artists table into S3...')
    artist_df.write.parquet("{}/artist_table.parquet".format(output_data))
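# --- Added usage sketch (not part of the original example) ---
# The snippet above relies on aliased imports from pyspark.sql.types (R, Fld, Str,
# Doub, Long) that are not shown; a minimal, hypothetical driver wiring it up.
# The bucket paths and app name are placeholders.
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import (StructType as R, StructField as Fld,
                               StringType as Str, DoubleType as Doub, LongType as Long)

if __name__ == "__main__":
    spark = SparkSession.builder.appName("sparkify_etl").getOrCreate()
    process_song_data(spark,
                      input_data="s3a://example-input-bucket/",
                      output_data="s3a://example-output-bucket/")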
Example #2
def process_song_data(spark, input_data, output_data):
    """
    The function loads data from the song_data dataset, extracts columns
    for the songs and artists tables, and writes the data into parquet
    files which will be stored on S3.
    
    """
    song_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long())
    ])

    # get filepath to song data file
    song_data = 'song_data/*/*/*/*.json'

    # read song data file
    df = spark.read.json(os.path.join(input_data, song_data),
                         schema=song_schema)

    # extract columns to create songs table
    songs_table = df.select('song_id', 'title', 'artist_id', 'year',
                            'duration').dropDuplicates()

    songs_table.createOrReplaceTempView('songs')

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy('year', 'artist_id').parquet(
        os.path.join(output_data, 'songs/songs.parquet'), 'overwrite')

    # extract columns to create artists table
    artists_table = df.select('artist_id', 'artist_name', 'artist_location',
                              'artist_latitude', 'artist_longitude') \
                      .withColumnRenamed('artist_name', 'name') \
                      .withColumnRenamed('artist_location', 'location') \
                      .withColumnRenamed('artist_latitude', 'latitude') \
                      .withColumnRenamed('artist_longitude', 'longitude') \
                      .dropDuplicates()

    artists_table.createOrReplaceTempView('artists')

    # write artists table to parquet files
    artists_table.write.parquet(
        os.path.join(output_data, 'artists/artists.parquet'), 'overwrite')
Example #3
def get_log_schema():
    """
    Creates a schema for log data.
    
    :return: schema
    """
    log_schema = R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Str()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Str()),
        Fld("song", Str()),
        Fld("status", Str()),
        Fld("ts", Long()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])
    return log_schema
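# --- Added usage sketch (not part of the original example) ---
# Passing the schema returned above to spark.read.json skips the costly
# schema-inference pass over the JSON files; the path and SparkSession setup
# below are placeholders, not taken from the original snippet.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("log_schema_demo").getOrCreate()
log_df = spark.read.json("s3a://example-bucket/log_data/*/*/*.json",
                         schema=get_log_schema())
log_df.printSchema()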
Example #4
def create_log_data():
    """
    Create schema for log data.
    
    return: schema
    """
    log_schema = StructType([
        StructField("artist", Str()), 
        StructField('auth', Str()),
        StructField('firstName', Str()),
        StructField('gender', Str()),
        StructField('itemInSession', Int()),
        StructField('lastName', Str()),
        StructField('length', Dbl()),
        StructField('level', Str()),
        StructField('location', Str()),
        StructField('method', Str()),
        StructField('page', Str()),
        StructField('registration', Dec()),
        StructField('sessionId', Int()),
        StructField('song', Str()),
        StructField('status', Int()),
        StructField('ts', Long()),
        StructField('userAgent', Str()),
        StructField('userId', Int())
    ])
    return log_schema
Example #5
def process_song_data(spark, input_data_songs, output_data):
    """
    Read song data by providing it an expected schema.
    Create songs and artists tables.
    """
    # define song data schema to improve performance
    song_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long())
    ])

    song_data = input_data_songs

    t_start = time()
    dfSongs = spark.read.json(song_data, schema=song_schema)
    t_end = time() - t_start
    print('Read song data in {} secs'.format(t_end))
    dfSongs.printSchema()

    dfSongs.count()
    dfSongs.show(5)

    songs_table = dfSongs.filter(dfSongs.song_id != '')\
                     .select(['song_id', 'title', 'artist_id', 'year', 'duration'])
    songs_table.show(5)
    songs_table.write.partitionBy(
        "year",
        "artist_id").mode('overwrite').parquet(output_data +
                                               'songs/songs_table.parquet')

    artists_table = dfSongs.filter(dfSongs.artist_id !='') \
                        .select(col("artist_id"),col("artist_name").alias("name"), col("artist_location").alias("location"),
                                 col("artist_longitude").alias("longitude"), col("artist_latitude").alias("latitude"))\
                        .dropDuplicates()

    artists_table.show(5)

    artists_table.write.mode('overwrite').parquet(
        output_data + 'artists/artists_table.parquet')
Example #6
def process_immigration_data(spark, path):
    us_immg_df = spark.read.parquet(path)
    us_immg_df=us_immg_df.select(col("i94res").cast(Int()),col("i94port"),
                           col("arrdate").cast(Int()), \
                           col("i94mode").cast(Int()),col("depdate").cast(Int()),
                           col("i94bir").cast(Int()),col("i94visa").cast(Int()),
                           col("count").cast(Int()), \
                           "gender",col("admnum").cast(Long()))
    us_immg_df = us_immg_df.dropDuplicates()
    travel_mode = get_travel_mode(spark)
    visa_type = get_visa_type(spark)
    ports = get_ports_data(spark)
    us_immg_df = us_immg_df.join(ports,
                                 us_immg_df.i94port == ports.id,
                                 how='left')

    us_immg_df = us_immg_df.withColumn("arrival_date",
                                       get_date(us_immg_df.arrdate))
    us_immg_df = us_immg_df.drop("id")
    get_timestamp(spark, us_immg_df)
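# --- Added sketch of the unshown get_date helper (an assumption) ---
# The snippet above calls get_date(us_immg_df.arrdate) without showing it. In the
# I94 immigration dataset, arrdate is typically an SAS-style day count (days since
# 1960-01-01), so one plausible implementation would be:
from datetime import datetime, timedelta

from pyspark.sql.functions import udf
from pyspark.sql.types import DateType

get_date = udf(
    lambda days: (datetime(1960, 1, 1) + timedelta(days=int(days))).date()
    if days is not None else None,
    DateType())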
Example #7
def process_log_data(spark, input_data, output_data):
    """
    Description: Read the log_data JSON files from S3 into a Spark dataframe, extract columns
                 to build the users table and the time table, and transform them into the
                 format this project needs.
    Parameters: -spark: spark session
                -input_data: location of the log_data json files (in S3 bucket)
                -output_data: location where the final tables will be saved (in S3 bucket)
    Return: None
    """
    #--------------------read log data--------------------#
    print('Read log_data...')
    # get filepath to log data file
    log_data = os.path.join(input_data, "log_data_*.json")
    
    # define the log data schema for reading
    LogSchema = R([
                    Fld("artist",Str()),
                    Fld("auth",Str()),
                    Fld("firstName",Str()),
                    Fld("gender",Str()),
                    Fld("itemInSession",Long()),
                    Fld("lastName",Str()),
                    Fld("length",Doub()),
                    Fld("level",Str()),
                    Fld("location",Str()),
                    Fld("method",Str()),
                    Fld("page",Str()),
                    Fld("registration",Doub()),
                    Fld("sessionId",Long()),
                    Fld("song",Str()),
                    Fld("status",Long()),
                    Fld("ts",Long()),
                    Fld("userAgent",Str()),
                    Fld("userId",Str()),
                    ])
    
    # read log data file
    log_df = spark.read.json(log_data, schema=LogSchema)

    #--------------------deal with user table--------------------#
    # extract columns for user df and drop the duplicated value and None value
    user_df = log_df.select(['userId', 'firstName', 'lastName', 'gender', 'level'])
    user_df = user_df.dropDuplicates(['userId'])
    user_df = user_df.dropna(how = "any", subset = ["userId"])
    user_df = user_df.filter(user_df.userId != "")
    
    print('User table: ')
    user_df.sort('userId').show(5)
        
    # write users table to parquet files
    print('Save User table into S3...')
    user_df.write.parquet("{}/user_table.parquet".format(output_data))

    #--------------------deal with time table--------------------#
    # define convert_timestamp function to convert the millisecond timestamp into datetime format
    def convert_timestamp(x):
        datetime_data = datetime.fromtimestamp(x/1000)
        return datetime_data
    
    # register convert_timestamp function for spark session by udf
    convert_timestamp_udf=udf(convert_timestamp, TimestampType())
    
    # extract columns for time df and drop the duplicated value and None value
    time_df = log_df.select(['ts'])
    time_df = time_df.dropDuplicates(['ts'])
    time_df = time_df.dropna(how = "any", subset = ["ts"])
    
    # use udf function defined above to convert the value in "ts column" into datetime format
    time_df = time_df.withColumn('start_time',convert_timestamp_udf('ts'))
    
    # use pyspark.sql.functions to extract hour, day, week, month, year, weekday
    time_df = time_df.withColumn('hour', F.hour('start_time'))
    time_df = time_df.withColumn('day', F.dayofmonth('start_time'))
    time_df = time_df.withColumn('week', F.weekofyear('start_time'))
    time_df = time_df.withColumn('month', F.month('start_time'))
    time_df = time_df.withColumn('year', F.year('start_time'))
    time_df = time_df.withColumn('weekday', F.dayofweek('start_time'))
    
    # select the necessary column for final time table
    time_df = time_df.select(['start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday'])

    print('Time table: ')
    time_df.show(5)

    # write time table to parquet files partitioned by year and month
    print('Save Time table into S3...')
    time_df.write.partitionBy("year", "month").parquet("{}/time_table.parquet".format(output_data))
    
    #--------------------deal with song_play table--------------------#
    # since the songplays table needs the join of song_data and log_data, we need to read song_data again.
    song_data = os.path.join(input_data, "song_data/A/*/*/*.json")
    
    # define the song data schema for reading
    SongSchema = R([
                    Fld("artist_id",Str()),
                    Fld("artist_latitude",Doub()),
                    Fld("artist_location",Str()),
                    Fld("artist_longitude",Doub()),
                    Fld("artist_name",Str()),
                    Fld("duration",Doub()),
                    Fld("num_songs",Long()),
                    Fld("song_id",Str()),
                    Fld("title",Str()),
                    Fld("year",Long())
                    ])
    
    # read song data file
    song_df = spark.read.json(song_data, schema=SongSchema)
        
    # filter by actions for song plays
    log_df_filter = log_df.where(log_df.page == 'NextSong')
    
    # create start_time column by using udf function defined above
    log_df_filter = log_df_filter.withColumn('start_time',convert_timestamp_udf('ts'))
    
    # join song_df and log_df_filter
    cond = [log_df_filter.artist == song_df.artist_name, 
            log_df_filter.song == song_df.title,
            log_df_filter.length == song_df.duration]

    songplay_df = log_df_filter.join(song_df, cond) \
                            .select([F.monotonically_increasing_id().alias('songplay_id'),
                                      log_df_filter.start_time,
                                      log_df_filter.userId,
                                      log_df_filter.level,
                                      song_df.song_id,
                                      song_df.artist_id,
                                      log_df_filter.sessionId,
                                      log_df_filter.location,
                                      log_df_filter.userAgent])
    
    print('Song_play table: ')
    songplay_df.show(10)
    
    # add year and month columns (needed for partitioning) and write songplays table to parquet files
    songplay_df = songplay_df.withColumn('year', F.year('start_time')).withColumn('month', F.month('start_time'))
    songplay_df.write.partitionBy("year", "month").parquet("{}/songplay_table.parquet".format(output_data))
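# --- Added worked example of the ts conversion used above (illustrative value) ---
# The log "ts" field is a Unix epoch in milliseconds, which is why it is divided
# by 1000 before datetime.fromtimestamp(); the value below is illustrative, not
# taken from the dataset.
from datetime import datetime

print(datetime.fromtimestamp(1541105830796 / 1000))  # a local datetime in November 2018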
Example #8
def process_log_data(spark, input_data, output_data, songs_data):
    """ Process log_data json files which located in S3
        Create table users, time and song_plays
        songs_data will be needed in creation song_plays table
        Store the table in parque format in S3
    
    Args:
      spark                           : Spark Session
      input_data  (string)            : location of json files (input)
      output_data (string)            : location of parquet files (output)
      songs_data  (Spark Dataframe)   : Song Data tables
      
    Returns:
      None
      
    
    """

    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # set schema log data
    logSchema = StructType([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Int()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Int()),
        Fld("song", Str()),
        Fld("status", Int()),
        Fld("ts", Long()),
        Fld("userAgent", Str()),
        Fld("userId", Str()),
    ])

    # read log data file
    df = spark.read.json(log_data, schema=logSchema)

    # filter by actions for song plays
    df = df.filter("page == 'NextSong'")

    # create temporary View for Log
    df.createOrReplaceTempView("logView")

    # extract columns for users table
    users_table = spark.sql("""
        WITH latestChange AS (
            SELECT userId AS userIdLatest,
                   MAX(ts) AS maxTs
            FROM logView
            GROUP BY userId
        )
        SELECT userId AS user_id,
               ts AS tsTemp,
               firstName AS first_name,
               lastName  AS last_name,
               gender,
               level
        FROM logView AS t1
        JOIN latestChange AS t2 
        ON t1.userId = t2.userIdLatest AND t1.ts = t2.maxTs  
        WHERE userId IS NOT NULL
        """).dropDuplicates(['user_id']).drop("tsTemp")

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users'), 'overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: getDateTime(int(x)), TST())
    df = df.withColumn("start_time", get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: getDateTime(int(x)), Dat())
    df = df.withColumn("date_time", get_datetime(df.ts))

    # extract columns to create time table
    time_table = df.select(col("start_time"),
                           hour(df.start_time).alias("hour"),
                           dayofmonth(df.date_time).alias("day"),
                           weekofyear(df.date_time).alias("week"),
                           month(df.date_time).alias("month"),
                           year(df.date_time).alias("year"),
                           date_format(df.date_time, "E").alias("weekday")) \
                      .where("start_time is not null") \
                      .dropDuplicates(['start_time'])

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(
        os.path.join(output_data, 'time'), 'overwrite')

    # create temporary View for Log and Song tables
    df.createOrReplaceTempView("logView")
    songs_data.createOrReplaceTempView("songView")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = spark.sql("""
        SELECT start_time,
               year(date_time) AS year,
               month(date_time) AS month,
               userId AS user_id,
               level,
               song_id,
               artist_id,
               sessionId AS session_id,
               location,
               userAgent AS user_agent 
        FROM logView AS t1
        JOIN songView AS t2
        ON  (t1.artist = t2.artist_name)
        AND (t1.song   = t2.title)
        AND (t1.length = t2.duration)
        """)

    songplays_table.show(10)
    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(
        os.path.join(output_data, 'songplays'), 'overwrite')
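# --- Added usage sketch (an assumption, not part of the original example) ---
# This variant expects the song data as an already-loaded DataFrame; a caller
# might wire it up roughly like this (paths, app name and schema inference are
# placeholders, not from the original).
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("log_etl_demo").getOrCreate()
input_data = "s3a://example-input-bucket/"
output_data = "s3a://example-output-bucket/"
songs_data = spark.read.json(input_data + "song_data/*/*/*/*.json")
process_log_data(spark, input_data, output_data, songs_data)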
Example #9
def process_log_data(spark, input_data, output_data):
    '''
        Description: This function can be used to load the log data from the input S3 bucket
                     and write the parquet files to the output S3 bucket.
        Arguments:
            spark: SparkSession
            input_data: location for the input data
            output_data: location for the output data
        Returns:
            None
    '''

    # get filepath to log data file
    log_data = os.path.join(input_data, "log_data/*.json")
    print(log_data)

    logsSchema = R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Int()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Long()),
        Fld("song", Str()),
        Fld("status", Int()),
        Fld("ts", Long()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])

    # read log data file
    df = spark.read.json(log_data, schema=logsSchema).distinct()
    #df = spark.read.json(log_data)
    print('df.count', df.count())
    df.show(5, truncate=False)
    df.printSchema()

    # filter by actions for song plays
    dfSongPlays = df.filter("page == 'NextSong'")

    # extract columns for users table
    dfSongPlays.createOrReplaceTempView("dfSongPlays")
    users_table = spark.sql(
        "select userId as user_id, firstName as first_name, lastName as last_name, gender, level from dfSongPlays"
    ).distinct()
    print('users_table.count', users_table.count())
    # write users table to parquet files
    users_table.repartitionByRange(
        3, "user_id").write.mode('overwrite').parquet(output_data + "users")

    # create timestamp column from original timestamp column
    dfWithDatetime = dfSongPlays.withColumn(
        'datetime', from_unixtime(dfSongPlays.ts / 1000))
    print('after adding datetime')
    dfWithDatetime.show(5, truncate=False)
    # extract columns to create time table
    dfWithDatetime.createOrReplaceTempView("dfWithDatetime")
    time_table = spark.sql("""
                                select 
                                        ts as start_time, 
                                        hour(datetime) as hour, 
                                        dayofmonth(datetime) as day, 
                                        weekofyear(datetime) as week, 
                                        month(datetime) as month, 
                                        year(datetime) as year, 
                                        dayofweek(datetime) as weekday 
                                    from dfWithDatetime
                                """).distinct()
    time_table.show(5, truncate=False)

    # write time table to parquet files partitioned by year and month
    time_table.write.mode('overwrite').partitionBy(
        "year", "month").parquet(output_data + "time")

    # read in song data to use for songplays table
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")
    # read song data file
    songsSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int())
    ])
    song_df = spark.read.json(song_data, schema=songsSchema).distinct()

    # extract columns from joined song and log datasets to create songplays table
    songplays_df = dfSongPlays.join(song_df, (dfSongPlays.artist == song_df.artist_name) & (dfSongPlays.song == song_df.title), how='left') \
                    .withColumn("songplay_id", monotonically_increasing_id()) \
                    .withColumn("year", year(from_unixtime(dfSongPlays.ts/1000))) \
                    .withColumn("month", month(from_unixtime(dfSongPlays.ts/1000))) # Could also use 'left_outer'
    songplays_df.show(5)
    songplays_df.createOrReplaceTempView("songplays_df")
    songplays_table = spark.sql("""
                                    select 
                                            songplay_id, 
                                            ts as start_time, 
                                            userId as user_id, 
                                            level,
                                            song_id, 
                                            artist_id, 
                                            sessionId as session_id, 
                                            location, 
                                            userAgent as user_agent,
                                            year, 
                                            month
                                        from songplays_df
                                """).distinct()

    songplays_table.show(5)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode('overwrite').partitionBy(
        "year", "month").parquet(output_data + "songplays")
Example #10
def process_log_data(spark, input_data, output_data):
    """
    Extract data from log data and write the users, time and songplays tables
    
    Arguments:
    - spark : SparkSession object
    - input_data : input data root dir path
    - output_data : output data root dir path
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*"

    # read log data file
    df = spark.read.json(log_data)

    
    # filter by actions for song plays
    df = df.where(df.page == "NextSong")

    # extract columns for users table
    df.createOrReplaceTempView("log_data")
    users_table = spark.sql("""
        SELECT lg.userId as user_id, lg.firstName as first_name, lg.lastName as last_name, lg.gender, lg.level FROM log_data lg
        JOIN (
        SELECT userId, MAX(ts) as latest FROM log_data
        GROUP BY userId
        ) max_ts 
        ON lg.userId = max_ts.userId
        WHERE lg.ts = max_ts.latest
    """).dropDuplicates(["user_id"])
    
    # write users table to parquet files
    users_table.write.parquet(output_data + "users_table")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x : (x // 1000), Long())
    df = df.withColumn("unix_timestamp", get_timestamp("ts"))
    
    # create datetime column from original timestamp column
    get_datetime = udf(lambda x : datetime.fromtimestamp(x), Timestamp())
    df = df.withColumn('datetime', get_datetime("unix_timestamp")) 
    
    # extract columns to create time table
    df.createOrReplaceTempView("log_data_time")
    time_table = spark.sql("""
        SELECT unix_timestamp as start_time,
        EXTRACT(hour from datetime) as hour,
        EXTRACT(day from datetime) as day,
        EXTRACT(week from datetime) as week,
        EXTRACT(month from datetime) as month,
        EXTRACT(year from datetime) as year,
        dayofweek(datetime) as weekday
        FROM log_data_time
    """).dropDuplicates(["start_time"])
    
    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(output_data + "time_table")

    # read in song data to use for songplays table
    song_df = spark.read.json(input_data + "song_data/*/*/*", schema=songSchema)
    
    # extract columns from joined song and log datasets to create songplays table 
    song_df.createOrReplaceTempView("song_data")
    time_table.createOrReplaceTempView("time_table")
    songplays_table = spark.sql("""
        SELECT unix_timestamp as start_time,
        t.year as year,
        t.month as month,
        userId,
        level,
        song_id,
        artist_id,
        sessionId,
        location,
        userAgent
        FROM log_data_time lg
        LEFT JOIN song_data s ON (lg.song = s.title and lg.artist = s.artist_name)
        LEFT JOIN time_table t ON (lg.unix_timestamp = t.start_time)
    """)

    songplays_table = songplays_table.withColumn('songplay_id', F.monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month

    songplays_table.write.partitionBy("year","month").parquet(output_data + "songplays_table")
Example #11
def process_log_data(spark, input_data, output_data):
    """
    Process the event log data files from S3 input_data, extract the
    `time`, `users` and `songplays` table data, and write parquet files
    to S3 output_data.

    :param spark: a SparkSession instance
    :param input_data: input file path
    :param output_data: output file path
    """
    # TODO: get filepath to log data file
    log_data = os.path.join(input_data, "log-data/*/*/*.json")

    # =================================
    # read log data file
    # =================================

    log_schema = StructType([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Long()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Long()),
        Fld("song", Str()),
        Fld("status", Int()),
        Fld("ts", Long()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])
    logger.info("Start reading `log_data` json file(s)...")

    start_time = time()

    df = spark.read.json(log_data, schema=log_schema).drop_duplicates()

    logger.info("Finished reading 'log_data' json file(s)...")

    logger.info(
        "Reading 'log_data' json took: {0:.2f} seconds".format(time() -
                                                               start_time))

    # TODO: filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # =================================
    # users_table
    # =================================

    # TODO: extract columns for users table
    logger.info("Preparing `users` dataframe")

    user_columns = ["userId", "firstName", "lastName", "gender", "level"]

    users_table = df.select(user_columns)

    logger.info("Start exporting `users` parquet files...")

    start_time = time()

    # write users table to parquet files
    users_table.write \
        .mode("overwrite") \
        .parquet(output_data + "users/")

    logger.info("Finished exporting `users` parquet files")

    logger.info("Exporting `users` parquet files took: {0:.2f} seconds".format(
        time() - start_time))

    # TODO: create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.utcfromtimestamp(x / 1000.0),
                        TimestampType())
    df = df.withColumn("ts_timestamp", get_timestamp("ts"))

    # TODO: create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.utcfromtimestamp(x / 1000.0).
                       strftime('%Y-%m-%d %H:%M:%S'))
    df = df.withColumn("ts_datetime", get_datetime("ts"))

    # =================================
    # time_table
    # =================================

    # TODO: extract columns to create time table
    logger.info("Preparing `time` dataframe")

    time_column = [
        "ts", "ts_datetime as start_time", "hour", "day", "week", "month",
        "year", "weekday"
    ]
    time_table = df.withColumn("hour", hour("ts_timestamp")) \
                    .withColumn("day", dayofmonth("ts_timestamp")) \
                    .withColumn("week", weekofyear("ts_timestamp")) \
                    .withColumn("month", month("ts_timestamp")) \
                    .withColumn("year", year("ts_timestamp")) \
                    .withColumn("weekday", dayofweek("ts_timestamp")) \
                    .selectExpr(time_column).drop_duplicates()

    logger.info("Start exporting `time` parquet files...")

    start_time = time()
    # TODO: write time table to parquet files partitioned by year and month
    time_table.write \
        .mode('overwrite') \
        .partitionBy('year', 'month') \
        .parquet(output_data + "time/")

    logger.info("Finished exporting `time` parquet files")

    logger.info("Exporting `time` parquet files took: {0:.2f} seconds".format(
        time() - start_time))
    # =================================
    # songplays_table
    # =================================

    # read in song data to use for songplays table
    song_input = os.path.join(input_data, "song-data/*/*/*/*.json")

    logger.info("Start reading `song_df` json file(s)...")

    song_df = spark.read.json(song_input)

    # TODO: extract columns from joined song and log datasets to create \
    # songplays table join with song_df
    logger.info("Start joining `song_df` and log datasets...")

    songplays_table = df.join(song_df,
                             [song_df.title == df.song,
                             song_df.artist_name == df.artist],
                             how='inner') \
                        .select([monotonically_increasing_id().alias("songplay_id"),
                            col("ts_datetime").alias("start_time"),
                            "userId",
                            "level",
                            "song_id",
                            "artist_id",
                            "sessionId",
                            "location",
                            "userAgent"])

    # TODO: join with time_table to extract month and year
    songplays_table = songplays_table.join(time_table,
                                        [songplays_table.start_time == time_table.start_time],
                                        how='inner')\
                                    .select(
                                        "songplay_id",
                                        songplays_table.start_time,
                                        "userId",
                                        "level",
                                        "song_id",
                                        "artist_id",
                                        "sessionId",
                                        "location",
                                        "userAgent",
                                        "month",
                                        "year"
                                    )

    logger.info("Start exporting `songplays` parquet files...")

    start_time = time()

    # TODO: write songplays table to parquet files partitioned by year and month
    songplays_table.write \
        .mode('overwrite') \
        .partitionBy('year', 'month') \
        .parquet(output_data + "songplays/")

    logger.info("Finished exporting `songplays` parquet files")

    logger.info(
        "Exporting `songplays` parquet files took: {0:.2f} seconds".format(
            time() - start_time))
Example #12
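# --- Added context (an assumption): the snippet below is truncated and relies on
# aliased pyspark.sql.types imports roughly like the following; the exact original
# import line (and the first schema fields) are not shown on this page.
from pyspark.sql.types import (StructType as R, StructField as Fld, StringType as Str,
                               IntegerType as Int, LongType as Long, DoubleType as Dbl,
                               DecimalType as Dec)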
    Fld("artist_latitude",Dec(10,5)),
    Fld("artist_location",Str()),
    Fld("artist_longitude", Dec(10,5)),
    Fld("artist_name",Str()),
    Fld("duration",Dec(10,5)),
    Fld("num_songs",Int()),
    Fld("song_id", Str()),
    Fld("title",Str()),
    Fld("year",Int()),
])

logSchema = R([
    Fld("artist",Str()),
    Fld("auth", Str()),
    Fld("firstName",Str()),
    Fld("gender", Str()),
    Fld("itemInSession",Int()),
    Fld("lastName",Str()),
    Fld("length",Dec(10,5)),
    Fld("level", Str()),
    Fld("location",Str()),
    Fld("method",Str()),
    Fld("page",Str()),
    Fld("registration", Dbl()),
    Fld("sessionId",Int()),
    Fld("song", Str()),
    Fld("status",Str()),
    Fld("ts",Long()),
    Fld("userAgent",Str()),
    Fld("userId", Str()),
])
Example #13
def process_log_data(spark, input_data, output_data):
    '''
    load log data in json format from the S3 bucket, process the data by extracting the
    users, time and songplays tables, and save these tables back to the S3 bucket
    
    :param spark: spark session
    :param input_data: data location for input data
    :param output_data: data location for output data
    :return: no return value
    '''

    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'
    # log_data = input_data + 'log_data/*.json' # for local files

    logdataSchema = R([
        Fld('artist', Str()),
        Fld('auth', Str()),
        Fld('firstName', Str()),
        Fld('gender', Str()),
        Fld('itemInSession', Long()),
        Fld('lastName', Str()),
        Fld('length', Dbl()),
        Fld('level', Str()),
        Fld('location', Str()),
        Fld('method', Str()),
        Fld('page', Str()),
        Fld('registration', Dbl()),
        Fld('sessionId', Long()),
        Fld('song', Str()),
        Fld('status', Long()),
        Fld('ts', Long()),
        Fld('userAgent', Str()),
        Fld('userId', Str()),
    ])

    # load json files from S3
    df_log = spark.read.json(log_data, schema=logdataSchema)
    df_log = df_log.filter(df_log.page == 'NextSong')

    # select users columns
    users_attr = ['userId', 'firstName', 'lastName', 'gender', 'level']
    users_table = df_log.select(users_attr)\
    .dropDuplicates()

    users_table = users_table\
    .withColumnRenamed('userId','user_id')\
    .withColumnRenamed('firstName','first_name')\
    .withColumnRenamed('lastName','last_name')

    # write users table to S3
    users_table.write.parquet(output_data + 'users/')

    # create time table
    tsFormat = 'yyyy-MM-dd HH:mm:ss z'
    time_table = df_log.withColumn('ts',
                                   to_timestamp(date_format((df_log.ts/1000)\
                                                            .cast(dataType=Tst()),
                                                            tsFormat), tsFormat))

    time_table = time_table.select(
        col('ts').alias('start_time'),
        hour(col('ts')).alias('hour'),
        dayofmonth(col('ts')).alias('day'),
        weekofyear(col('ts')).alias('week'),
        month(col('ts')).alias('month'),
        year(col('ts')).alias('year'))

    # write time table to S3
    time_table.write.partitionBy('year',
                                 'month').parquet(output_data + 'time/')

    # load songs and artist tables from previous handling
    df_songs = spark.read.parquet(output_data + 'songs/*/*/*')
    df_artists = spark.read.parquet(output_data + 'artists/*')
    df_artists = df_artists.drop('location')

    # create songs_logs table
    songs_logs = df_log.join(df_songs, (df_log.song == df_songs.title))

    # create artists_songs_logs table
    artists_songs_logs = songs_logs.join(
        df_artists, (songs_logs.artist == df_artists.name))

    artists_songs_logs = artists_songs_logs\
    .withColumn('ts',
                to_timestamp(date_format((artists_songs_logs.ts/1000)\
                                         .cast(dataType=Tst()),tsFormat), tsFormat))

    # create songplays table
    songplays = artists_songs_logs.join(
        time_table, artists_songs_logs.ts == time_table.start_time, 'left')

    songplays_attr = [
        'start_time', 'userId', 'level', 'song_id', 'artist_id', 'sessionId',
        'location', 'userAgent', 'year', 'month'
    ]

    songplays_table = songplays.select(songplays_attr)\
    .dropDuplicates()

    songplays_table = songplays_table\
    .withColumnRenamed('userId','user_id')\
    .withColumnRenamed('sessionId','session_id')\
    .withColumnRenamed('userAgent','user_agent')\
    .repartition('year', 'month')

    # write songplays table to S3
    songplays_table.write.partitionBy('year', 'month').parquet(output_data +
                                                               'songplays/')
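# --- Added note (an alternative sketch, not from the original example) ---
# The ts -> timestamp round-trip above (date_format + to_timestamp) can also be
# expressed by casting the millisecond epoch directly, e.g.:
#     df_log.withColumn('ts', (df_log.ts / 1000).cast('timestamp'))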
Example #14
def process_log_data(spark, input_data, output_data):
    """
    The function loads data from log_data dataset and extract columns
    for users and time tables, reads both the log_data and song_data
    datasets and extracts columns for songplays table with the data.
    It writes the data into parquet files which will be loaded on s3.
    Parameters
    
    """

    song_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long())
    ])

    log_schema = R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Str()),
        Fld("lastName", Str()),
        Fld("length", Str()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Str()),
        Fld("sessionId", Str()),
        Fld("song", Str()),
        Fld("status", Str()),
        Fld("ts", Str()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])

    # get filepath to log data file
    log_data = 'log_data/*/*/*.json'

    # read log data file
    df = spark.read.json(os.path.join(input_data, log_data), schema=log_schema)

    # filter by actions for song plays
    actions_df = df.filter(df.page == 'NextSong') \
                   .select('ts', 'userId', 'level', 'song', 'artist',
                           'sessionId', 'location', 'userAgent')

    # extract columns for users table
    users_table = df.select('userId', 'firstName', 'lastName', 'gender',
                            'level').dropDuplicates()
    users_table.createOrReplaceTempView('users')

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users/users.parquet'),
                              'overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: str(int(int(x) / 1000)))
    actions_df = actions_df.withColumn('timestamp',
                                       get_timestamp(actions_df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000)))
    actions_df = actions_df.withColumn('datetime', get_datetime(actions_df.ts))

    # extract columns to create time table
    time_table = actions_df.select('datetime') \
                           .withColumn('start_time', actions_df.datetime) \
                           .withColumn('hour', hour('datetime')) \
                           .withColumn('day', dayofmonth('datetime')) \
                           .withColumn('week', weekofyear('datetime')) \
                           .withColumn('month', month('datetime')) \
                           .withColumn('year', year('datetime')) \
                           .withColumn('weekday', dayofweek('datetime')) \
                           .dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time/time.parquet'), 'overwrite')

    # read in song data to use for songplays table
    song_data = 'song_data/*/*/*/*.json'
    song_df = spark.read.json(os.path.join(input_data, song_data),
                              schema=song_schema)

    # extract columns from joined song and log datasets to create songplays table
    actions_df = actions_df.alias('log_df')
    song_df = song_df.alias('song_df')
    joined_df = actions_df.join(
        song_df,
        col('log_df.artist') == col('song_df.artist_name'), 'inner')
    songplays_table = joined_df.select(
        col('log_df.datetime').alias('start_time'),
        col('log_df.userId').alias('user_id'),
        col('log_df.level').alias('level'),
        col('song_df.song_id').alias('song_id'),
        col('song_df.artist_id').alias('artist_id'),
        col('log_df.sessionId').alias('session_id'),
        col('log_df.location').alias('location'),
        col('log_df.userAgent').alias('user_agent'),
        year('log_df.datetime').alias('year'),
        month('log_df.datetime').alias('month')) \
        .withColumn('songplay_id', monotonically_increasing_id())

    songplays_table.createOrReplaceTempView('songplays')

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'songplays/songplays.parquet'), 'overwrite')
Example #15
def process_log_data(spark, input_data, output_data):
    """
    Reads from log files, 
    transforms them into users, time, and songplays data, 
    and writes them in parquet format. 
    
    params:
    - spark: spark session object
    - input_data: input data path
    - output_data: output data path
    """

    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"

    # use schema when read json files
    log_schema = St([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Long()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Long()),
        Fld("song", Str()),
        Fld("status", Long()),
        Fld("ts", Long()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])

    # read log data file
    df = spark.read.json(log_data, schema=log_schema)

    # filter by actions for song plays
    df = df.where("page='NextSong'")

    # extract columns for users table
    users_table = df.selectExpr("userId as user_id", "firstName as first_name", "lastName as last_name", \
                                "gender", "level").dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + "users", mode="overwrite")

    # change column name from ts to start_time
    time_table = df.select(col("ts").alias("start_time")).dropDuplicates()

    # convert datatype of start_time into datetime
    get_timestamp = udf(lambda ts: (datetime.fromtimestamp(ts // 1000)), Ts())
    time_table = time_table.withColumn("start_time",
                                       get_timestamp("start_time"))

    # add columns to create time table
    time_table = time_table \
        .withColumn("hour", hour("start_time"))\
        .withColumn("day", date_format("start_time", "dd"))\
        .withColumn("weekofyear", weekofyear("start_time"))\
        .withColumn("month", month("start_time"))\
        .withColumn("year", year("start_time"))\
        .withColumn("weekday", dayofweek("start_time"))

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(output_data + "time", mode="overwrite",
                             partitionBy=["year", "month"])

    # read in song data to use for songplays and artists table
    song_df = spark.read.parquet(output_data + "songs")
    artist_df = spark.read.parquet(output_data + "artists")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df \
        .join(song_df, (df.song == song_df.title) & (df.length == song_df.duration))\
        .join(artist_df, song_df.artist_id == artist_df.artist_id)\
        .select(get_timestamp("ts").alias("start_time"),
                col("userId").alias("user_id"),
                df.level,
                song_df.song_id,
                artist_df.artist_id,
                col("sessionId").alias("session_id"),
                df.location,
                col("userAgent").alias("user_agent"))\
        .dropDuplicates()

    # add year and month columns for partitioning
    songplays_table = songplays_table\
        .withColumn("year", year("start_time"))\
        .withColumn("month", month("start_time"))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(output_data + "songplays/songplays",
                                  mode="overwrite",
                                  partitionBy=["year", "month"])
Example #16
def process_log_data(spark, input_data_logs, output_data):
    """
    Read the log data using the expected schema.
    Create users, time and songplays tables.
    """
    # create log data schema to improve performance
    log_schema = R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Long()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Long()),
        Fld("song", Str()),
        Fld("status", Long()),
        Fld("ts", Long()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])

    log_data = input_data_logs

    t_start = time()
    dfLogs = spark.read.json(log_data, schema=log_schema)
    t_end = time() - t_start
    print('Read log data in {} secs'.format(t_end))

    dfLogs.printSchema()
    dfLogs.count()
    dfLogs.show(5)

    # filter NextSong records
    dfNextSongLogs = dfLogs.filter(dfLogs.page == 'NextSong')

    users_table = dfNextSongLogs.filter(dfNextSongLogs.userId !='') \
                        .select(col("userId").alias("user_id"),col("firstName").alias("first_name"), col("lastName").alias("last_name"), col("gender"), col("level")) \
                        .dropDuplicates()

    users_table.show(20)

    users_table.write.mode('overwrite').parquet(output_data +
                                                'users/users_table.parquet')
    get_timestamp = udf(lambda ms: datetime.fromtimestamp(ms / 1000.0),
                        TimestampType())
    dfNextSongLogs = dfNextSongLogs.withColumn('start_time',
                                               get_timestamp('ts'))

    time_table = dfNextSongLogs.select('start_time')\
                            .withColumn('hour',hour('start_time')).withColumn('day',dayofmonth('start_time'))\
                            .withColumn('week',weekofyear('start_time')).withColumn('month', month('start_time'))\
                            .withColumn('year', year('start_time')).withColumn('weekday',dayofweek('start_time'))
    time_table.show(5)

    time_table.write.partitionBy(
        "year", "month").mode('overwrite').parquet(output_data +
                                                   'time/time_table.parquet')

    dfSongs = spark.read.parquet(output_data + '/songs/')

    songplays_table = dfNextSongLogs.join(dfSongs, (dfNextSongLogs.song == dfSongs.title) & (dfNextSongLogs.length == dfSongs.duration), 'left_outer')\
        .select(
            dfNextSongLogs.start_time,
            col("userId").alias('user_id'),
            dfNextSongLogs.level,
            dfSongs.song_id,
            dfSongs.artist_id,
            col("sessionId").alias("session_id"),
            dfNextSongLogs.location,
            col("useragent").alias("user_agent"),
            year('start_time').alias('year'),
            month('start_time').alias('month'))\
        .withColumn("idx", monotonically_increasing_id())

    songplays_table = songplays_table.filter(
        "song_id is not null and artist_id is not null")
    songplays_table.show(5)

    songplays_table.write.partitionBy(
        "year",
        "month").mode('overwrite').parquet(output_data +
                                           'songplays/songplays_table.parquet')
Example #17

if __name__ == "__main__":
    s3_bucket = sys.argv[1]
    s3_key = sys.argv[2]
    aws_key = sys.argv[3]
    aws_secret_key = sys.argv[4]
    redshift_conn_string = sys.argv[5]
    db_user = sys.argv[6]
    db_pass = sys.argv[7]

    spark = create_spark_session(aws_key, aws_secret_key)

    movies_schema = StructType([
        Fld("adult", String()),
        Fld("belongs_to_collection", Long()),
        Fld("budget", Long()),
        Fld("genres", String()),
        Fld("homepage", String()),
        Fld("id", Int()),
        Fld("imdb_id", String()),
        Fld("original_language", String()),
        Fld("original_title", String()),
        Fld("overview", String()),
        Fld("popularity", Dbl()),
        Fld("poster_path", String()),
        Fld("production_company", String()),
        Fld("production_country", String()),
        Fld("release_date", Date()),
        Fld("revenue", Long()),
        Fld("runtime", Float()),
Example #18
def process_song_data(spark, input_data, output_data):
    """
    Process the song data files from S3 input_data, extract the `songs`
    and `artists` table data, and write parquet files to S3 output_data.

    :param spark: a SparkSession instance
    :param input_data: input file path
    :param output_data: output file path
    """
    # TODO: get filepath to song data file
    song_data = os.path.join(input_data, "song-data/*/*/*/*.json")

    # =================================
    # read song data file
    # =================================

    # This schema is based on conducting data profiling
    song_schema = StructType([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long()),
    ])

    logger.info("Start reading `song_data` json file(s)...")
    start_time = time()

    df = spark.read.json(song_data, schema=song_schema).drop_duplicates()

    logger.info("Finished reading 'song_data' json file(s)...")

    logger.info(
        "Reading 'song_data' json took: {0:.2f} seconds".format(time() -
                                                                start_time))

    # =================================
    # songs_table
    # =================================

    # TODO: extract columns to create songs table
    logger.info("Preparing `songs` dataframe")

    song_columns = ["song_id", "title", "artist_id", "year", "duration"]
    songs_table = df.select(song_columns).drop_duplicates()

    logger.info("Start exporting `songs` parquet files...")

    start_time = time()

    # TODO: write songs table to parquet files partitioned by year and artist
    songs_table.write \
        .mode("overwrite") \
        .partitionBy('year', 'artist_id') \
        .parquet(output_data + "songs/")

    logger.info("Finished exporting `songs` parquet files")

    logger.info("Exporting `songs` parquet files took: {0:.2f} seconds".format(
        time() - start_time))

    # =================================
    # artists_table
    # =================================

    # TODO: extract columns to create artists table
    logger.info("Preparing `artists` dataframe")

    artist_columns = [
        "artist_id", "artist_name as name", "artist_location as location",
        "artist_longitude as longitude", "artist_latitude as latitude"
    ]

    artists_table = df.selectExpr(artist_columns)

    logger.info("Start exporting `artists` parquet files...")

    start_time = time()

    # TODO: write artists table to parquet files
    artists_table.write \
        .mode("overwrite") \
        .parquet(output_data + "artists/")

    logger.info("Finished exporting `artists` parquet files")

    logger.info(
        "Exporting `artists` parquet files took: {0:.2f} seconds".format(
            time() - start_time))
Example #19
def process_log_data(spark, input_data, output_data):
    """
    Read the songplay log json files into parquet tables
    :param spark: spark session
    :type spark: SparkSession
    :param input_data: path (local or s3) to prefix to log_data root
    :type input_data: str
    :param output_data: path (local or s3) to write output parquet files to
    :type output_data: str
    :return: None
    :rtype: None
    """
    # get filepath to log data file
    
    log_data = input_data + 'log_data/*.json'
    
    log_data_schema = R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Int()),
        Fld("lastName", Str()),
        Fld("length", Decimal()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Int(), nullable=False),
        Fld("song", Str()),
        Fld("status", Int()),
        Fld("ts", Long(), nullable=False),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])
    # read log data file
    df = spark.read.json(log_data, schema=log_data_schema)

    # filter by actions for song plays
    df = df.where(col('page') == "NextSong")

    # extract columns for users table    
    users_table = df.selectExpr('userId as user_id', 'firstName as first_name', 'lastName as last_name', 'gender', 'level')
    
    # write users table to parquet files
    users_table.write.parquet(output_data + 'users_table')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: x / 1000.0, Dbl())
    df = df.withColumn('epoch_ts', get_timestamp(df.ts))
        
    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x / 1000.0), TimeStamp())
    df = df.withColumn('dt', get_datetime(df.ts))
    
    # extract columns to create time table
    time_table = df.selectExpr('dt as start_time', 
                               'hour(dt) as hour', 
                               'dayofmonth(dt) as day', 
                               'weekofyear(dt) as week',
                               'month(dt) as month',
                               'year(dt) as year',
                               'dayofweek(dt) as weekday')
    
    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(output_data + 'time_table')

    # read in song data to use for songplays table
    song_data = input_data + "song_data/*/*/*/*.json"
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table 
    cond = [df.artist == song_df.artist_name, df.song == song_df.title]#, df.length == song_df.duration]

    # Join on artist name and song title match
    songplays_table = df.join(song_df, cond, 'inner').selectExpr('dt as start_time',
                                                                 'userId as user_id', 
                                                                 'level',
                                                                 'song_id',
                                                                 'artist_id',
                                                                 'sessionId as session_id',
                                                                 'location',
                                                                 'userAgent as user_agent'
                                                                ).withColumn('songplay_id', monotonically_increasing_id()) # For autoincrement primary key
    

    # add year and month columns, then write songplays table to parquet files partitioned by year and month
    songplays_table = songplays_table.selectExpr('*', 'year(start_time) as year', 'month(start_time) as month')
    songplays_table.write.partitionBy('year', 'month').parquet(output_data + 'songplays_table')