def run_pipeline(name: str, data: str, save: str) -> None:
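    """Build, cross-validate and save a ride-duration regression pipeline.

    Assumed meaning of the arguments (not stated in this snippet): `name` is the
    Spark application / model name, `data` is a base path containing `rides/` and
    `stations/` CSV folders, and `save` is the output prefix (e.g. S3) for the
    fitted pipeline.
    """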

    spark = SparkSession.builder.appName(name).getOrCreate()

    # Dataset Creation #

    # read bike ride history csv's
    df = spark.read.csv(f'{data}/rides/*', header=True)
    df = df.select(['Duration', 'Start date', 'Start station number', 'Member type'])
    df = df.withColumn('Start station number', df['Start station number'].cast(IntegerType()))
    print(f'The rides dataset has [{df.count()}] rows!')

    # read station information csv
    stations = spark.read.csv(f'{data}/stations/*', header=True)
    print(f'The stations dataset has {stations.count()} rows!')
    stations = stations.withColumnRenamed('LATITUDE', 'start_station_lat')
    stations = stations.withColumnRenamed('LONGITUDE', 'start_station_long')
    stations = stations.withColumn('Start station number', stations['TERMINAL_NUMBER'].cast(IntegerType()))
    stations = stations.select(['start_station_lat', 'start_station_long', 'Start station number'])

    # remove rides longer than 1.5 hours
    one_and_a_half_hours = 60 * 60 * 1.5
    df = df.filter(df['Duration'] <= one_and_a_half_hours)

    # remove rides shorter than 3 minutes
    three_minutes = 60 * 3
    df = df.filter(df['Duration'] >= three_minutes)

    # remove unknown 'Member type's
    df = df.filter(df['Member type'] != 'Unknown')

    # remove non-existent stations
    df = df.filter(~df['Start station number'].isin([31008, 32051, 32034]))

    # make target feature
    df = df.withColumn('label', F.log1p(df.Duration))

    # join on 'Start station number'
    print('Merging rides and stations dataframes!')
    df = df.join(stations, on='Start station number')
    df = df.withColumn('start_station_long', df['start_station_long'].cast(DoubleType()))
    df = df.withColumn('start_station_lat', df['start_station_lat'].cast(DoubleType()))

    print(f'Complete rides and stations dataset has {df.count()} rows!')

    # Feature Transformations #
    print('Doing Feature Transformations!')

    # convert to datetime type
    df = df.withColumn('Start date', F.to_timestamp('Start date', 'yyyy-MM-dd HH:mm:ss'))
    df = df.withColumn('day_of_week', F.dayofweek('Start date'))
    df = df.withColumn('week_of_year', F.weekofyear('Start date'))
    df = df.withColumn('month', F.month('Start date'))
    df = df.withColumn('minute', F.minute('Start date'))
    df = df.withColumn('hour', F.hour('Start date'))

    # make time features cyclical
    pi = 3.141592653589793

    df = df.withColumn('sin_day_of_week', F.sin(2 * pi * df['day_of_week'] / 7))
    df = df.withColumn('sin_week_of_year', F.sin(2 * pi * df['week_of_year'] / 53))
    df = df.withColumn('sin_month', F.sin(2 * pi * (df['month'] - 1) / 12))
    df = df.withColumn('sin_minute', F.sin(2 * pi * df['minute'] / 60))
    df = df.withColumn('sin_hour', F.sin(2 * pi * df['hour'] / 24))

    df = df.withColumn('cos_day_of_week', F.cos(2 * pi * df['day_of_week'] / 7))
    df = df.withColumn('cos_week_of_year', F.cos(2 * pi * df['week_of_year'] / 53))
    df = df.withColumn('cos_month', F.cos(2 * pi * (df['month'] - 1) / 12))
    df = df.withColumn('cos_minute', F.cos(2 * pi * df['minute'] / 60))
    df = df.withColumn('cos_hour', F.cos(2 * pi * df['hour'] / 24))

    # concatenate as strings (Column + Column would attempt numeric addition and yield nulls)
    df = df.withColumn('hour_and_day_of_week', F.concat_ws('_', df['hour'].cast(StringType()), df['day_of_week'].cast(StringType())))
    df = df.withColumn('member_type_and_day_of_week', F.concat_ws('_', df['Member type'], df['day_of_week'].cast(StringType())))

    # drop unused columns
    drop_columns = [
        'Start date',
        'Start station number',
        'Duration',
        'day_of_week',
        'week_of_year',
        'month',
        'minute',
        'hour'
    ]
    df = df.drop(*drop_columns)

    # df.select([F.count(F.when(F.isnan(c), c)).alias(c) for c in df.columns]).show()

    # Model and Pipeline #

    # split training and test
    train, test = df.randomSplit([.7, .3])

    # encode categorical column 'Member type'
    member_indexer = StringIndexer(inputCol='Member type', outputCol='member_idx')
    member_encoder = OneHotEncoder(inputCol='member_idx', outputCol='member_enc')

    # create vector of features named 'features'
    vector = VectorAssembler(
        inputCols=[
            'start_station_lat',
            'start_station_long',
            'sin_day_of_week',
            'cos_day_of_week',
            'sin_week_of_year',
            'cos_week_of_year',
            'sin_month',
            'cos_month',
            'sin_minute',
            'cos_minute',
            'sin_hour',
            'cos_hour',
            'member_enc'
        ],
        outputCol='features'
    )

    # scale features
    scaler = StandardScaler(
        inputCol='features',
        outputCol='scaled_features'
    )

    # define model
    model = GeneralizedLinearRegression(
        featuresCol='scaled_features'
    )

    # create pipeline and fill in stages
    pipeline = Pipeline(
        stages=[
            member_indexer,
            member_encoder,
            vector,
            scaler,
            model
        ]
    )

    # evaluation method
    evaluation = RegressionEvaluator()

    # best parameter search
    grid = ParamGridBuilder()
    # grid = grid.addGrid(model.maxDepth, [5, 7])
    # grid = grid.addGrid(model.numTrees, [200, 500])
    grid = grid.addGrid(model.maxIter, [40, 50])
    grid = grid.addGrid(model.family, ['gaussian', 'gamma'])
    grid = grid.addGrid(model.regParam, [0.0, 0.1])
    grid = grid.build()

    # run cross validation
    cv = CrossValidator(
        estimator=pipeline,
        estimatorParamMaps=grid,
        evaluator=evaluation,
        numFolds=7
    )

    print('Doing Cross Validation!')

    cv_models = cv.fit(train)
    print(f'CV results: {cv_models.avgMetrics} (RMSE)')

    best_model = cv_models.bestModel
    best_params = extract_best_params(best_model.stages[-1].extractParamMap())
    print(f'Best params:\n{best_params}')

    results = cv_models.transform(test)
    print(f'CV results on holdout dataset: {evaluation.evaluate(results)} (RMSE)')

    print('Re-fitting pipeline on entire dataset!')
    cv_models = cv.fit(df)

    print('Saving pipeline to S3!')
    entire_dataset_best_model = cv_models.bestModel
    entire_dataset_best_model.save(f'{save}/{name}')
    print('Done!')

    return
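
# `extract_best_params` is used above but not included in this snippet. A minimal
# sketch of what such a helper might do, assuming it simply flattens the fitted
# stage's ParamMap into readable {parameter name: value} pairs:
def extract_best_params(param_map):
    """Turn a Spark ParamMap (Param -> value) into a plain dict keyed by parameter name."""
    return {param.name: value for param, value in param_map.items()}

# Hypothetical invocation with placeholder paths:
# run_pipeline(name='bike_duration_glr', data='s3://my-bucket/bikeshare', save='s3://my-bucket/models')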
Example #2
def process_log_data(spark, input_data, output_data):
    """
    This function uses SparkSession to read raw songs data from S3, transform it and save it back to S3
    in the form of three dimension tables: users, time and songplays. The last two are partitioned by year and month.
    """

    # set filepath to folder containing all log data files
    log_data = input_data + "log_data/*/*/*.json"

    # read log data file and filter it by song plays only
    print("Loading log data from " + input_data + "...")
    log_df = spark.read.json(log_data)
    print("Loading complete!")
    log_df = log_df.filter(log_df.page == 'NextSong')

    # extract columns for users table
    users_table = log_df.select(
        ['userId', 'firstName', 'lastName', 'gender',
         'level']).dropDuplicates()

    print("Writing users data into " + output_data)
    # write users table to parquet files
    users_table.write.format('parquet').mode('overwrite').save(output_data +
                                                               'users')
    print("Writing complete!")

    # create timestamp column from original unix timestamp column
    get_timestamp = udf(lambda ts: datetime.fromtimestamp(ts / 1000.0),
                        TimestampType())
    log_df = log_df.withColumn('timestamp', get_timestamp('ts'))

    # create datetime column from original unix timestamp column
    get_datetime = udf(lambda ts: datetime.fromtimestamp(ts / 1000.0),
                       DateType())
    log_df = log_df.withColumn('datetime', get_datetime('ts'))

    # extract several time measurements from timestamp to create several columns
    time_table = (log_df.select(['ts', 'timestamp'])
                  .withColumn('year', year('timestamp'))
                  .withColumn('month', month('timestamp'))
                  .withColumn('week', weekofyear('timestamp'))
                  .withColumn('weekday', dayofweek('timestamp'))
                  .withColumn('day', dayofmonth('timestamp'))
                  .withColumn('hour', hour('timestamp'))
                  .dropDuplicates())

    print("Writing time data into " + output_data)
    # write time table to parquet files partitioned by year and month
    time_table.write.format('parquet').partitionBy(
        ['year', 'month']).mode('overwrite').save(output_data + 'time')
    print("Writing complete!")

    # read songs data from S3 once again
    song_data = input_data + "song-data/*/*/*/*.json"
    print("Loading songs data from " + input_data + " ...")
    songs_df = spark.read.json(song_data)

    # extract columns from both song and log datasets via a SQL query to create the songplays table
    songs_df.createOrReplaceTempView('songs')
    log_df.createOrReplaceTempView('log')
    songplays_table = spark.sql("""
        SELECT
            l.timestamp as start_time,
            year(l.timestamp) as year,
            month(l.timestamp) as month,
            l.userId as user_id,
            l.level as level,
            s.song_id,
            s.artist_id,
            l.sessionId as session_id,
            s.artist_location as location,
            l.userAgent as user_agent
        FROM log l
        JOIN songs s
            ON s.title = l.song
            AND l.artist = s.artist_name
        """).dropDuplicates()

    print("Writing songplay data into " + output_data)
    # write songplays table to partitioned parquet files to optimise and save space
    songplays_table.write.partitionBy([
        'year', 'month'
    ]).format('parquet').mode('overwrite').save(output_data + 'songplays')
    print("Writing complete!")
Example #3
        month(datetime) AS month,
        year(datetime) AS year,
        dayofweek(datetime) AS weekday
    FROM staging_events
    ORDER BY start_time
""")

time_table.printSchema()


# +
time_table = df2.select(
    col('datetime').alias('start_time'),
    hour('datetime').alias('hour'),
    dayofmonth('datetime').alias('day'),
    weekofyear('datetime').alias('week'),
    month('datetime').alias('month'),
    year('datetime').alias('year'),
    dayofweek('datetime').alias('weekday')
)
time_table = time_table.drop_duplicates(subset=['start_time'])

time_table.printSchema()

# +
# time_table.select('hour').limit(1).show(1)

# +
# s_s3_path = "{}/time.parquet".format(output_data)
# time_table.write.mode('overwrite').partitionBy(
#     'year', 'month').parquet(s_s3_path)
Example #4
def process_log_data(spark, input_data, output_data):
    '''
    Description: process_log_data loads data from the s3 bucket given by input_data, processes it, and
    stores the results in another s3 bucket located at output_data

    Parameters:
    spark       : Spark Session
    input_data  : s3 address where the log_data json files are stored (in this specific scenario, a public s3 bucket)
    output_data : s3 bucket where dimensional tables in parquet format will be stored after processing
    '''

    # get filepath to log data file
    log_data = input_data + 'data/log-data/*'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    users_table = df.selectExpr('userId AS user_id', 'firstName AS first_name',
                                'lastName AS last_name', 'gender',
                                'level').drop_duplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + 'users/')

    # create timestamp column from original timestamp column

    df = df.withColumn(
        'start_time',
        to_timestamp(
            from_unixtime((col("ts") / 1000),
                          'yyyy-MM-dd HH:mm:ss.SSS')).cast("Timestamp"))

    # extract columns to create time table
    time_table = df.select('start_time',
                           hour('start_time').alias('hour'),
                           dayofmonth('start_time').alias('day'),
                           weekofyear('start_time').alias('week'),
                           month('start_time').alias('month'),
                           year('start_time').alias('year'),
                           dayofweek('start_time').alias('weekday'))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year",
                                 "month").parquet(output_data + 'time/')

    # read in song data to use for songplays table (reading the parquet files stored before)
    song_df = spark.read.parquet(
        output_data + 'songs/*/*/*'
    )  # Partitioned by year and artist_id, only two levels to read

    # extract columns from joined song and log datasets to create songplays table
    # Final Columns - songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent
    # Creating an id for songplays_table ->
    songplays_table =  df.withColumn('songplay_id', monotonically_increasing_id()) \
                         .join(song_df, song_df.title == df.song) \
                         .select('songplay_id',
                                 'start_time',
                                 col('userId').alias('user_id'),
                                 'level',
                                 'song_id',
                                 col('artist').alias('artist_id'),
                                 col('sessionID').alias('session_id'),
                                 'location',
                                 col('userAgent').alias('user_agent'),
                                month('start_time').alias('month'),
                                 year('start_time').alias('year'))

    # write songplays table to parquet files
    songplays_table.write.partitionBy("year", "month").parquet(output_data +
                                                               'songplays/')
Example #5
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = os.path.join(input_data, 'log_data/*/*/*.json')

    # read log data file
    log_data_raw = spark.read.json(log_data)

    # filter by actions for song plays
    log_data_stage = log_data_raw.filter(log_data_raw.page == 'NextSong')

    # extract columns for users table
    users_table = log_data_stage.select(
        col('userId').alias('user_id'),
        col('firstName').alias('first_name'),
        col('lastName').alias('last_name'), col('gender'),
        col('level')).distinct()

    users_table.show(5, truncate=False)

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users'), 'overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000))
    log_data_stage = log_data_stage.withColumn(
        'timestamp', get_timestamp(log_data_stage.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime(
        '%Y-%m-%d %H:%M:%S'))
    log_data_stage = log_data_stage.withColumn('date',
                                               get_datetime(log_data_stage.ts))

    # extract columns to create time table
    time_table = log_data_stage.select(
        col('date').alias('start_time'),
        hour('date').alias('hour'),
        dayofmonth('date').alias('day'),
        weekofyear('date').alias('week'),
        month('date').alias('month'),
        year('date').alias('year'),
        date_format('date',
                    'EEEE').alias('weekday')).distinct().orderBy('start_time')

    time_table.show(5, truncate=False)

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time'), 'overwrite')

    # read in song data to use for songplays table
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')
    songs_stage = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = log_data_stage.join(
        songs_stage, (log_data_stage.artist == songs_stage.artist_name) &
        (log_data_stage.song == songs_stage.title))
    songplays_table = songplays_table.withColumn('songplay_id',
                                                 monotonically_increasing_id())
    songplays_table = songplays_table.select(
        col('songplay_id'),
        col('date').alias('start_time'),
        year('date').alias('year'),
        month('date').alias('month'),
        col('userId').alias('user_id'), col('level'), col('song_id'),
        col('artist_id'),
        col('sessionId').alias('session_id'), col('location'),
        col('userAgent').alias('user_agent')).orderBy('user_id', 'session_id')

    songplays_table.show(5, truncate=False)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'songplays'), 'overwrite')
Example #6
def process_log_data(spark, input_data, output_data):
    """
    Transform raw log data from S3 into analytics tables on S3
    
    This function reads in log data in JSON format from S3; defines the schema
    of songplays, users, and time analytics tables; processes the raw data into
    those tables; and then writes the tables into partitioned parquet files on
    S3.
    
    Args:
        spark: a Spark session
        input_data: an S3 bucket to read log data in from
        output_data: an S3 bucket to write analytics tables to
    """

    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"

    # read log data file
    log_data_schema = StructType([
        StructField("artist", StringType(), True),
        StructField("auth", StringType(), False),
        StructField("firstName", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("itemInSession", IntegerType(), False),
        StructField("lastName", StringType(), True),
        StructField("length", DoubleType(), True),
        StructField("level", StringType(), False),
        StructField("location", StringType(), True),
        StructField("method", StringType(), False),
        StructField("page", StringType(), False),
        StructField("registration", DoubleType(), True),
        StructField("sessionId", IntegerType(), False),
        StructField("song", StringType(), True),
        StructField("status", IntegerType(), False),
        StructField("ts", DoubleType(), False),
        StructField("userAgent", StringType(), True),
        StructField("userId", StringType(), True)
    ])
    df = spark.read.json(log_data, schema=log_data_schema)

    # filter by actions for song plays
    df = df.filter(col("page") == "NextSong")

    # extract columns for users table
    users_table = (df.withColumn(
        "max_ts_user",
        max("ts").over(Window.partitionBy("userID"))).filter(
            (col("ts") == col("max_ts_user")) & (col("userID") != "")
            & (col("userID").isNotNull())).select(
                col("userID").alias("user_id"),
                col("firstName").alias("first_name"),
                col("lastName").alias("last_name"), "gender", "level"))

    # write users table to parquet files
    users_table.write.parquet(output_data + "users_table.parquet",
                              mode="overwrite")

    # create datetime column from original timestamp column
    get_datetime = udf(
        lambda x: datetime.fromtimestamp(x / 1000).replace(microsecond=0),
        TimestampType())
    df = df.withColumn("start_time", get_datetime("ts"))

    # extract columns to create time table
    time_table = (df.withColumn("hour", hour("start_time")).withColumn(
        "day", dayofmonth("start_time")).withColumn(
            "week", weekofyear("start_time")).withColumn(
                "month", month("start_time")).withColumn(
                    "year", year("start_time")).withColumn(
                        "weekday", dayofweek("start_time")).select(
                            "start_time", "hour", "day", "week", "month",
                            "year", "weekday").distinct())

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(output_data + "time_table.parquet",
                             mode="overwrite",
                             partitionBy=["year", "month"])

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + "songs_table.parquet")

    # extract columns from joined song and log datasets to create songplays table
    artists_table = spark.read.parquet(output_data + "artists_table.parquet")
    songs = (song_df.join(artists_table, "artist_id",
                          "full").select("song_id", "title", "artist_id",
                                         "name", "duration"))
    songplays_table = df.join(songs, [
        df.song == songs.title, df.artist == songs.name, df.length
        == songs.duration
    ], "left")
    songplays_table = (songplays_table.join(
        time_table, "start_time",
        "left").select("start_time",
                       col("userId").alias("user_id"), "level", "song_id",
                       "artist_id",
                       col("sessionId").alias("session_id"), "location",
                       col("userAgent").alias("user_agent"), "year",
                       "month").withColumn("songplay_id",
                                           monotonically_increasing_id()))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(output_data + "songplays_table.parquet",
                                  mode="overwrite",
                                  partitionBy=["year", "month"])
display()

# COMMAND ----------

# MAGIC %md ## Feature Engineering
# MAGIC In the remaining part of the Notebook #1, we will demonstrate how to generate new features for this kind of use case. It is definitely not meant to be a comprehensive list.

# COMMAND ----------

# MAGIC %md In the following cell, we created some time features and calculated the total number of warning_type1 (type2) events that occurred for a machine on a particular day. We also identified a data quality issue: some event counts had negative values.

# COMMAND ----------

# Extract some time features from "date" column
df = df.withColumn('month', month(df['date']))
df = df.withColumn('weekofyear', weekofyear(df['date']))
df = df.withColumn('dayofmonth', dayofmonth(df['date']))


# warning related raw features
warning_type1_features = [s for s in df.columns if "warning_1_" in s]

warning_type2_features = [s for s in df.columns if "warning_2_" in s]

warning_all = warning_type1_features + warning_type2_features

# total count of all type1 warnings each day each device
df = df.withColumn('warn_type1_total', sum(df[col_n] for col_n in warning_type1_features))
# total count of all type2 warnings each day each device
df = df.withColumn('warn_type2_total', sum(df[col_n] for col_n in warning_type2_features))
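
# COMMAND ----------

# The markdown cell above notes that some event counts were negative. A minimal
# sketch of one way to handle that, assuming negatives are data-quality artifacts
# that should be clipped to zero (the totals above would then be recomputed):
from pyspark.sql import functions as F

for col_n in warning_all:
    df = df.withColumn(col_n, F.when(F.col(col_n) < 0, 0).otherwise(F.col(col_n)))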
Example #8
def process_log_data(spark, input_data, output_data):
    """import log dataset extract columns create users, time, and songplays table
    
    Parameters:
    spark: name of spark session
    input_data: location of the source data s3 bucket 
    output_data: location of the destination data s3 bucket
    
    Returns:
    writes users table in parquet to output_data location + users
    writes time_table in parquet to output_data location + time
    writes songplays table in parquest to output_data location + songplays    
    
    """
    """Setting up the JSON table structure for the log dataset"""
    log_dataset_schema = R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("iteminSession", Lng()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Lng()),
        Fld("song", Str()),
        Fld("status", Lng()),
        Fld("ts", Lng()),
        Fld("userAgent", Str()),
        Fld("userId", Str()),
    ])

    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"

    # read log data file
    df = spark.read.json(log_data, schema=log_dataset_schema)

    # filter by actions for song plays
    df = df.filter(col("page") == 'NextSong')

    # extract columns for users table
    users_table = df.select(
        col("userId").alias("user_id"),
        col("firstName").alias("first_name"),
        col("lastName").alias("last_name"), "gender", "level")

    # drop duplicate rows in users table
    users_table = users_table.dropDuplicates()

    # write users table to parquet files
    users_table.write.mode('append').parquet(output_data + "users")

    # create timestamp column from original timestamp column
    dfTimestamp = df.withColumn("start_time", to_timestamp(col("ts") / 1000))
    dfTimestamp = dfTimestamp \
    .withColumn("hour", hour("start_time")) \
    .withColumn("day", dayofmonth("start_time")) \
    .withColumn("week", weekofyear("start_time")) \
    .withColumn("month", month("start_time")) \
    .withColumn("year", year("start_time")) \
    .withColumn("weekday", date_format("start_time", 'E'))

    # extract columns to create time table
    time_table = dfTimestamp.select(col("start_time"), \
                                    col("hour"), \
                                    col("day"), \
                                    col("week"), \
                                    col("month"), \
                                    col("year"), \
                                    col("weekday"))

    # drop duplicate rows in time table
    time_table = time_table.dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.mode('append').partitionBy(
        "year", "month").parquet(output_data + "time")

    # read in song data to use for songplays table
    song_data = input_data + "song_data/*/*/*/*.json"
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = song_df.join(df, song_df.artist_name==df.artist). \
        withColumn("songplay_id", monotonically_increasing_id()). \
        withColumn('start_time', to_timestamp(col("ts") /1000 )). \
        select("songplay_id",
        "start_time",
        col("userId").alias("user_id"),
        "level",
        "song_id",
        "artist_id",
        col("sessionId").alias("session_id"),
        col("artist_location").alias("location"),
        "userAgent",
        month(col("start_time")).alias("month"),
        year(col("start_time")).alias("year"))

    # drop duplicate rows in songplays table
    songplays_table = songplays_table.dropDuplicates()

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode('append').partitionBy(
        'year', 'month').parquet(output_data + "songplays")
Example #9
def process_log_data(spark, input_data, output_data):
    ''' Function to read log data from an s3 bucket, perform transformations and load the target tables '''
    
    # get filepath to log data file
    log_data = os.path.join(input_data,"log_data/*/*/*.json")

    # read log data file
    df_log = spark.read.json(log_data)
    
    # filter by actions for song plays and also filter the records where userId is null
    df_log = df_log.filter(col("page")=='NextSong').filter(df_log.userId.isNotNull())

    # extract columns for users table    
    users_table = df_log.select(col("userId").alias("user_id"),col("firstName").alias("first_name"),\
                        col("lastName").alias("last_name"), "gender", "level").dropDuplicates()
    
    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users.parquet'), 'overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: str(int(int(x) / 1000)))
    df_log = df_log.withColumn("timestamp",get_timestamp(col("ts")))
    
    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000.0)))
    df_log = df_log.withColumn("datetime", get_datetime(col("ts")))
    
    # extract columns to create time table
    time_table = df_log.select(
         col('datetime').alias('start_time'),
         hour('datetime').alias('hour'),
         dayofmonth('datetime').alias('day'),
         weekofyear('datetime').alias('week'),
         month('datetime').alias('month'),
         year('datetime').alias('year'),
         date_format('datetime', 'E').alias('weekday')
     )  
    
    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year','month').parquet(os.path.join(output_data, 'time.parquet'), 'overwrite')
    
    # read in song data to use for songplays table
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")
    song_df = spark.read.json(song_data) 

    # extract columns from joined song and log datasets to create songplays table 
    df_songplay = song_df.join(df_log,
                           (song_df.artist_name == df_log.artist) &
                           (song_df.title == df_log.song) &
                           (song_df.duration == df_log.length))
    
    songplays_table = df_songplay.select(
        monotonically_increasing_id().alias('songplay_id'),
        col('datetime').alias('start_time'),
        col('level').alias('level'),
        col('userId').alias('user_id'),
        col('song_id').alias('song_id'),
        col('artist_id').alias('artist_id'),
        col('sessionId').alias('session_id'),
        col('location').alias('location'),
        col('userAgent').alias('user_agent'),
        year('datetime').alias('year'),
        month('datetime').alias('month')
        ).dropDuplicates()

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(os.path.join(output_data, 'songplays.parquet'), 'overwrite')
Example #10
def process_log_data(spark, input_data, output_data):
    """
    Process log data from json files and create users, time, and songplays tables in parquet.
    """
    # get filepath to log data file
    log_data = input_data

    # read log data file
    df = spark.read.json("{}log_data/*/*/*.json".format(log_data))

    # filter by actions for song plays
    df = df.filter(df['page'] == "NextSong")

    # extract columns for users table
    users_table = df.withColumn("last_stamp", max_(col('ts')).over(Window.partitionBy("userId"))) \
        .filter(col('ts') == col('last_stamp')) \
        .select('userId', 'firstName',
                'lastName', 'gender', 'level')

    # write users table to parquet files
    users_table.write.parquet(os.path.join(
        output_data, 'users'), mode='overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(
        x/1000.0), TimestampType())
    df = df.withColumn("timestamp", get_timestamp(col('ts')))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x/1000.0), DateType())
    df = df.withColumn("datetime", get_datetime(col('ts')))

    # extract columns to create time table
    time_table = df.withColumn('hour', hour(df.timestamp)) \
        .withColumn('day', dayofmonth(df.timestamp)) \
        .withColumn('week', weekofyear(df.timestamp)) \
        .withColumn('month', month(df.timestamp)) \
        .withColumn('year', year(df.timestamp)) \
        .withColumn('weekday', date_format('timestamp', 'u')) \
        .select('timestamp', 'hour', 'day', 'week', 'month', 'year', 'weekday').distinct()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time_tbl'), mode='overwrite')

    # read in song data to use for songplays table
    song_df = spark.read.parquet(os.path.join(output_data, 'songs'))

    artist_df = spark.read.parquet(os.path.join(output_data, 'artists'))

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, (df.song == song_df.title)
                              & (df.length == song_df.duration), 'left_outer') \
                        .join(artist_df, (song_df.artist_id == artist_df.artist_id)
                              & (df.artist == artist_df.artist_name), 'left_outer') \
                        .select(
                            df.timestamp.alias("start_time"),
                            df.userId.alias("user_id"),
                            df.level, song_df.song_id,
                            song_df.artist_id, df.sessionId.alias(
                                "session_id"),
                            df.location, df.userAgent.alias("user_agent")
    ).withColumn("songplay_id", monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    songplays_table.join(time_table, (songplays_table.start_time == time_table.timestamp)) \
                   .select(songplays_table["*"], time_table.year, time_table.month) \
                   .write.partitionBy('year', 'month').parquet(os.path.join(output_data, 'songplays'), mode='overwrite')
Example #11
def process_log_data(spark, input_data, output_data):
    """
    :param spark: spark session to process data
    :param input_data: s3 bucket where contains log data files
    :param output_data: file path where result of this function will save files
    """

    # get filepath to log data file
    log_data = f"{input_data}{default_log_subpath}"

    # read log data file
    df = spark.read.json(log_data, mode='PERMISSIVE').drop_duplicates()

    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table
    users_table = (df.select("userId", "firstName", "lastName", "gender",
                             "level").drop_duplicates())

    # write users table to parquet files
    users_table.write.parquet(os.path.join(f"{output_data}/users/"),
                              mode="overwrite")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.utcfromtimestamp(int(x) / 1000),
                        TimestampType())
    df = df.withColumn("start_time", get_timestamp("ts"))

    # create datetime column from original timestamp column
    get_date = udf(lambda x: datetime.utcfromtimestamp(int(x) / 1000),
                   DateType())
    df = df.withColumn("start_date", get_date("ts"))

    # create year and month columns to use as partition key in songplays_table
    df = (df.withColumn("datetime_year",
                        year("start_time")).withColumn("datetime_month",
                                                       month("start_time")))

    # extract columns to create time table
    time_table = (df.withColumn("hour", hour("start_time")).withColumn(
        "day", dayofmonth("start_time")).withColumn(
            "week", weekofyear("start_time")).withColumnRenamed(
                "datetime_year",
                "year").withColumnRenamed("datetime_month", "month").select(
                    "start_time", "hour", "day", "week", "month",
                    "year").drop_duplicates())

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(os.path.join(f"{output_data}/time_table/"),
                             mode='overwrite',
                             partitionBy=["year", "month"])

    # read in song data to use for songplays table
    song_df = (spark.read.format("parquet").option(
        "basePath", os.path.join(output_data, "songs/")).load(
            os.path.join(f"{output_data}/songs{default_log_subpath}/")))

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = (df.join(
        song_df, df.song == song_df.title,
        how='inner').select(monotonically_increasing_id().alias("songplay_id"),
                            df.start_time, df.userId, df.level,
                            song_df.song_id, song_df.artist_id, df.sessionId,
                            df.location, df.userAgent, df.datetime_year,
                            df.datetime_month))

    # write songplays table to parquet files partitioned by year and month
    (songplays_table
        .withColumnRenamed("datetime_year", "year")
        .withColumnRenamed("datetime_month", "month")
        .withColumnRenamed("userId", "user_id")
        .withColumnRenamed("sessionId", "session_id")
        .withColumnRenamed("userAgent", "user_agent")
        .drop_duplicates()
        .write.parquet(os.path.join(f"{output_data}/songplays/"),
                       mode="overwrite",
                       partitionBy=["year", "month"]))

    return df, users_table, time_table, song_df, songplays_table
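
# Unlike the other examples, this version returns its dataframes, so a caller can
# sanity-check them. A minimal sketch (assumption: `spark`, `input_data` and
# `output_data` are already configured):
# frames = process_log_data(spark, input_data, output_data)
# for label, frame in zip(["log", "users", "time", "songs", "songplays"], frames):
#     print(f"{label}: {frame.count()} rows")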
Example #12
def process_log_data(spark, song_input_data, log_input_data, output_data):
    """
    Reads JSON files located at log data directory.
    Filters log data to only contain song plays (page = NextSong)
    Extracts relevent columns for "users", removes duplicates.    
    Writes user data.     
    Computes timestamp and datetime data from original timestamp, and adds to dataframe.
    Extracts datetime column, and decomposes into date heirarchy to create new "time" table.
    Writes time data, partitioned by year and month.    
    Reads JSON files located at song data directory.
    Joins log data frame to songs data frame, on song title and artist name match.
    Forms somngplays table, creating aliases for column names, and adding unique primary key to each record.
    Writes songplay data, partitioned by year and month. 
    """
    # get filepath to log data file
    log_data = log_input_data

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    users_table = df['userId', 'firstName', 'lastName', 'gender', 'level']
    users_table = users_table.dropDuplicates(['userId', 'level'])

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users/users.parquet'),
                              'overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: str(int(int(x) / 1000)))
    df = df.withColumn('timestamp', get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000)))
    df = df.withColumn('datetime', get_datetime(df.ts))

    # extract columns to create time table
    time_table = df.select('datetime') \
                           .withColumn('start_time', df.datetime) \
                           .withColumn('hour', hour('datetime')) \
                           .withColumn('day', dayofmonth('datetime')) \
                           .withColumn('week', weekofyear('datetime')) \
                           .withColumn('month', month('datetime')) \
                           .withColumn('year', year('datetime')) \
                           .withColumn('weekday', dayofweek('datetime'))
    time_table = time_table.dropDuplicates(['datetime'])

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time/time.parquet'), 'overwrite')

    # read in song data to use for songplays table
    song_df = spark.read.json(song_input_data)

    # extract columns from joined song and log datasets to create songplays table
    songplays_df = df.join(song_df, (df.artist == song_df.artist_name) &
                           (df.song == song_df.title))
    songplays_df = songplays_df.withColumn('month', month('datetime'))
    songplays_df = songplays_df.withColumn('songplay_id',
                                           monotonically_increasing_id())

    songplays_table = songplays_df.select(
        col('songplay_id').alias('songplay_id'),
        col('datetime').alias('start_time'),
        col('userId').alias('user_id'),
        col('level').alias('level'),
        col('song_id').alias('song_id'),
        col('artist_id').alias('artist_id'),
        col('sessionId').alias('session_id'),
        col('location').alias('location'),
        col('userAgent').alias('user_agent'),
        col('year').alias('year'),
        col('month').alias('month'))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'songplays/songplays.parquet'), 'overwrite')
    print("Songplays table written.")
Example #13
def process_log_data(spark, input_data, output_data):
    """This function will get the data from Udacity s3 bucket available for this project. Extract data from the log_data path, select the columns that the project requires and create the output tables in parquet files for the artist and song table."""

    # get filepath to log data file
    log_data = input_data + "log-data"

    # read log data file
    df = spark.read.json(log_data)
    
    # filter by actions for song plays
    df = df.filter(col("page")=='NextSong').filter(df.userId.isNotNull())

    # extract columns for users table    
    users_table = df.select(col("userId").alias("user_id"), col("firstName").alias("first_name"), col("lastName").alias("last_name"), "gender", "level").dropDuplicates()
    
    # write users table to parquet files
    print("""##### [STARTING] Writing table to the parquet files: 
                   USERS #####
                   """)
    users_table.write.mode("overwrite").parquet(output_data+"users")
    print("""##### [FINISHED] Table USERS already loaded #####
    """)  

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: str(int(int(x) / 1000)))
    df = df.withColumn("timestamp",get_timestamp(col("ts"))) 
    
    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000.0)))
    df = df.withColumn("datetime", get_datetime(col("ts"))) 
    
    # extract columns to create time table
    time_table = df.select(
         'timestamp',
         hour('datetime').alias('hour'),
         dayofmonth('datetime').alias('day'),
         weekofyear('datetime').alias('week'),
         month('datetime').alias('month'),
         year('datetime').alias('year'),
         date_format('datetime', 'E').alias('weekday')
     )
    
    # write time table to parquet files partitioned by year and month
    print("""##### [STARTING] Writing table to the parquet files: 
                   TIME #####
                   """)
    time_table.write.mode("overwrite").partitionBy("year","month").parquet(output_data+"time")
    print("""##### [FINISHED] Table TIME already loaded #####
    """)  

    # read in song data to use for songplays table
    song_data = input_data + "song_data/*/*/*/*.json"
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table 
    # Creating a string variable with the timestamp format
    tsFormatVar = "yyyy/MM/dd HH:MM:ss z"

    '''In this part, the songplays table is built with a join between the two dataframes, and the required
    columns are then selected (applying transformations where needed).'''
    songplays_table = song_df.join(df, (song_df.artist_name == df.artist) & (song_df.title == df.song)) \
        .withColumn("songplay_id", monotonically_increasing_id()) \
        .withColumn('start_time', to_timestamp(date_format((col("ts") / 1000).cast(dataType=TimestampType()), tsFormatVar), tsFormatVar)) \
        .select("songplay_id",
                "start_time",
                col("userId").alias("user_id"),
                "level",
                "song_id",
                "artist_id",
                col("sessionId").alias("session_id"),
                col("artist_location").alias("location"),
                "userAgent",
                month(col("start_time")).alias("month"),
                year(col("start_time")).alias("year"))
    


    # write songplays table to parquet files partitioned by year and month
    print("""##### [STARTING] Writing table to the parquet files: 
                   SONGPLAYS #####
                   """)
    songplays_table.write.mode("overwrite").partitionBy("year","month").parquet(output_data+"songplays")
    print("""##### [FINISHED] Table SONGPLAYS already loaded #####
    """)  
def process_log_data(spark, input_data, output_data):
    """
    Description:
            Process the event log file and extract data for table time, users and songplays from it.
    :param spark: a spark session instance
    :param input_data: input file path
    :param output_data: output file path
    """

    # get filepath to log data file
    log_data = os.path.join(input_data, "log-data/")

    # read log data file
    df = spark.read.json(
        log_data,
        mode='PERMISSIVE',
        columnNameOfCorruptRecord='corrupt_record').drop_duplicates()

    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table
    users_table = df.select("userId", "firstName", "lastName", "gender",
                            "level").drop_duplicates()

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, "users/"),
                              mode="overwrite")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.utcfromtimestamp(int(x) / 1000),
                        TimestampType())
    df = df.withColumn("start_time", get_timestamp("ts"))

    # extract columns to create time table
    time_table = df.withColumn("hour",hour("start_time"))\
                    .withColumn("day",dayofmonth("start_time"))\
                    .withColumn("week",weekofyear("start_time"))\
                    .withColumn("month",month("start_time"))\
                    .withColumn("year",year("start_time"))\
                    .withColumn("weekday",dayofweek("start_time"))\
                    .select("ts","start_time","hour", "day", "week", "month", "year", "weekday").drop_duplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(os.path.join(output_data, "time_table/"),
                             mode='overwrite',
                             partitionBy=["year", "month"])

    # read in song data to use for songplays table
    song_df = spark.read\
                .format("parquet")\
                .option("basePath", os.path.join(output_data, "songs/"))\
                .load(os.path.join(output_data, "songs/*/*/"))

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, df.song == song_df.title, how='inner') \
                        .select(monotonically_increasing_id().alias("songplay_id"),
                                col("start_time"),
                                col("userId").alias("user_id"),
                                "level", "song_id", "artist_id",
                                col("sessionId").alias("session_id"),
                                "location",
                                col("userAgent").alias("user_agent"))

    songplays_table = songplays_table.join(time_table,
                                           songplays_table.start_time == time_table.start_time,
                                           how="inner") \
                                     .select("songplay_id", songplays_table.start_time, "user_id",
                                             "level", "song_id", "artist_id", "session_id",
                                             "location", "user_agent", "year", "month")

    # write songplays table to parquet files partitioned by year and month
    songplays_table.drop_duplicates().write.parquet(
        os.path.join(output_data, "songplays/"),
        mode="overwrite",
        partitionBy=["year", "month"])
Example #15
def process_log_data(spark, input_data, output_data):
    """
    Process log data in s3 bucket by extracting users table, time table
    and songplays table. Normalize and transform data to be store as
    parquet files in s3 bucket
    :param spark: a spark session
    :param input_data: data from s3 bucket location
    :param output_data: data to s3 bucket location
    :return: None
    """
    # get filepath to log data file
    log_data = input_data + "data/log_data/*.json"

    # setting schema types for log data fields
    log_data_schema = StructType([
        StructField("artist", StringType(), True),
        StructField("auth", StringType(), True),
        StructField("firstName", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("itemInSession", LongType(), True),
        StructField("lastName", StringType(), True),
        StructField("length", DoubleType(), True),
        StructField("level", StringType(), True),
        StructField("location", StringType(), True),
        StructField("method", StringType(), True),
        StructField("page", StringType(), True),
        StructField("registration", DoubleType(), True),
        StructField("sessionId", LongType(), True),
        StructField("song", StringType(), True),
        StructField("status", LongType(), True),
        StructField("ts", LongType(), True),
        StructField("userAgent", StringType(), True),
        StructField("userId", StringType(), True),
    ])

    # read log data file
    log_df = spark.read.json(log_data, schema=log_data_schema)

    # columns to be used
    firstName = 'firstName'
    gender = 'gender'
    lastName = 'lastName'
    level = 'level'
    userId = 'userId'
    start_time = 'start_time'
    hour = 'hour'
    day = 'day'
    week = 'week'
    month = 'month'
    year = 'year'
    weekday = 'weekday'

    # filter by actions for song plays
    log_df = log_df.filter(log_df.page == 'NextSong')

    # extract columns for users table
    users_table = log_df.select(firstName, lastName, gender, level, userId)

    # write users table to parquet files
    users_table.write.parquet(f"{output_data}users_table.parquet")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.datetime.fromtimestamp(x / 1000),
                        TimestampType())
    log_df = log_df.withColumn("timestamp", get_timestamp(log_df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.datetime.fromtimestamp(x / 1000),
                       TimestampType())
    log_df = log_df.withColumn("start_time", get_datetime(log_df.ts))

    # extract columns to create time table
    log_df = log_df.withColumn("hour", F.hour("timestamp"))
    log_df = log_df.withColumn("day", F.dayofweek("timestamp"))
    log_df = log_df.withColumn("week", F.weekofyear("timestamp"))
    log_df = log_df.withColumn("month", F.month("timestamp"))
    log_df = log_df.withColumn("year", F.year("timestamp"))
    log_df = log_df.withColumn("weekday", F.dayofweek("timestamp"))

    time_table = log_df.select(start_time, hour, day, week, month, year,
                               weekday)

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy(
        "year", "month").parquet(f"{output_data}time_table.parquet")

    # read in song data to use for songplays table
    song_data = input_data + "data/song_data/song_data/*/*/*/*.json"
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    time_table.createOrReplaceTempView("time_table")
    log_df.createOrReplaceTempView("log_df_table")
    song_df.createOrReplaceTempView("song_df_table")
    songplays_table = spark.sql("""
       SELECT DISTINCT lt.start_time,
                        lt.userId,
                        lt.level,
                        lt.sessionId,
                        lt.location,
                        lt.userAgent,
                        st.song_id,
                        st.artist_id,
                        tt.year,
                        tt.month
        FROM log_df_table lt 
        INNER JOIN song_df_table st 
        ON st.artist_name = lt.artist 
        INNER JOIN time_table tt
        ON tt.start_time = lt.start_time 
    """)

    # create songplays_id columns which increases monotonically
    songplays_table = songplays_table.withColumn("songplay_id",
                                                 monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy(
        "year", "month").parquet(f"{output_data}songplays_table.parquet")
Example #16
def process_log_data(spark, input_data, output_data):
    """Process log_data
    
    Process log_data from the input path(local or S3)
    Create dimension tables for users, time and songplays fact table
    
    Args:
        spark: SparkSession
        input_data: Input Path (local or S3)
        output_data: Output path (local or S3)
    Returns:
        None 
    """

    print("Loading log_data")
    # get filepath to log data file
    log_data = os.path.join(input_data, "log_data/*/*/*.json")

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(col("page") == "NextSong")
    print("Loaded %d log actions " % df.count())

    # extract columns for users table
    users_table = df.select("userId", "firstName", "lastName", "gender", "level","ts") \
        .orderBy("ts",ascending=False) \
        .dropDuplicates(subset=["userId"]) \
        .drop('ts')

    print("Writing users data")
    # write users table to parquet files
    users_table.write.mode("overwrite").parquet(
        os.path.join(output_data, "users/"))

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(int(int(x) / 1000)),
                        TimestampType())

    # build start_time column and year and month for partitioning
    df = df.withColumn("start_time", get_timestamp(df.ts)) \
        .withColumn("month", month(col("start_time"))) \
        .withColumn("year", year(col("start_time")))

    # extract columns to create time table
    time_table = df.select("start_time", "month", "year").dropDuplicates() \
        .withColumn("hour", hour(col("start_time"))) \
        .withColumn("day", dayofmonth(col("start_time"))) \
        .withColumn("week", weekofyear(col("start_time"))) \
        .withColumn("weekday", dayofweek(col("start_time")))

    print("Writing time data")
    # write time table to parquet files partitioned by year and month
    time_table.write.mode("overwrite") \
        .partitionBy("year", "month") \
        .parquet(os.path.join(output_data, 'time/'))

    # read in song data to use for songplays table
    song_df = spark.read.parquet(os.path.join(output_data,
                                              'songs/')).withColumnRenamed(
                                                  "year", "song_year")

    print("Joining logs with songs data")
    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, (df.artist == song_df.artist_name) &
                              (df.song == song_df.title) &
                              (df.length == song_df.duration),
                              how='inner')

    print("Loaded %d songplays " % songplays_table.count())

    songplays_table = songplays_table.withColumn("songplay_id",
                                                 monotonically_increasing_id())

    print("Writing songplays data")
    # write songplays table to parquet files partitioned by year and month
    songplays_table.selectExpr( [
         "songplay_id",
         "start_time",
         "userId as user_id",
         "level",
         "song_id",
         "artist_id",
         "sessionId as session_id",
         "location",
         "userAgent as user_agent",
         "year",
         "month" ]) \
        .write.mode("overwrite") \
        .partitionBy("year", "month") \
        .parquet(output_data + "songplays/")
Example No. 17
def process_log_data(spark, input_data, output_data):
    """
        Description: This function loads log_data from S3 (or a local path) and processes it by extracting the users, time and songplays tables,
                    which are then written back to S3 as parquet files. The song data is read again with spark.read.json to build the songplays table.
        
        Parameters:
            spark       : Spark Session
            input_data  : location of log_data json files with the events data
            output_data : S3 bucket were dimensional tables in parquet format will be stored
            
    """
    # get filepath to log data file
    log_data = os.path.join(input_data,"log_data/*/*/*.json")

    # read log data file
    df = spark.read.json(log_data)
    
    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table    
    users_table = df['userId', 'firstName', 'lastName', 'gender', 'level']
    users_table = users_table.dropDuplicates(['userId'])
    
    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users.parquet'), 'overwrite')
    print("users.parquet completed")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: str(int(int(x)/1000)))
    df = df.withColumn('timestamp', get_timestamp(df.ts))
    
    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000.0)))
    df = df.withColumn("datetime", get_datetime(df.ts))
    
    # extract columns to create time table
    time_table = df.select(
        col('datetime').alias('start_time'),
        hour('datetime').alias('hour'),
        dayofmonth('datetime').alias('day'),
        weekofyear('datetime').alias('week'),
        month('datetime').alias('month'),
        year('datetime').alias('year') 
   )
    time_table = time_table.dropDuplicates(['start_time'])
    
    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(os.path.join(output_data, 'time.parquet'), 'overwrite')
    print("time.parquet completed")

    # read in song data to use for songplays table
    song_data = os.path.join(input_data, "song-data/A/A/A/*.json")
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table 
    df = df.join(song_df, song_df.title == df.song)
    
    songplays_table = df.select(
        col('ts').alias('start_time'),
        col('userId').alias('user_id'),
        col('level').alias('level'),
        col('song_id').alias('song_id'),
        col('artist_id').alias('artist_id'),
        col('sessionId').alias('session_id'),
        col('location').alias('location'),
        col('userAgent').alias('user_agent'),
        year('datetime').alias('year'),
        month('datetime').alias('month')
    )

    songplays_table = songplays_table.withColumn('songplay_id', monotonically_increasing_id())
    
    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(os.path.join(output_data, 'songplays.parquet'), 'overwrite')
    print("songplays.parquet completed")
    print("process_log_data completed")
Example No. 18
def process_log_data(spark, input_data, output_data):
    """
    Process the log dataset and create the users table, time table and songplays table
    :param spark: SparkSession
    :param input_data: path/to/files to process
    :param output_data: path/to/files to write the results Datasets
    
    """
    # get filepath to log data file
    print('Reading Log Data')
    log_data = input_data + 'log_data/*.json'

    # read log data file
    df = spark.read.json(log_data).dropDuplicates()
    
    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong').cache()
    
    # rename fields
    fields = [("artist", "artist"),
          ("auth", "auth"),
          ("firstName", "first_name"),
          ("gender", "gender"),
          ("itemInSession", "itemInSession"),
          ("lastName", "last_name"),
          ("length", "length"),
          ("level", "level"),
          ("location", "location"),
          ("method", "method"),
          ("page", "page"),
          ("registration", "registration"),
          ("sessionId", "session_id"),
          ("song", "song"),
          ("status", "status"),
          ("ts", "ts"),
          ("userAgent", "user_agent"),
          ("userId", "user_id")
          ]
    exprs = [ "{} as {}".format(field[0],field[1]) for field in fields]
    df = df.selectExpr(*exprs)

    # extract columns for users table    
    print('Creating Users Table')
    users_columns = ['user_id', 'first_name', 'last_name', 'gender', 'level']
    users_table =  df.selectExpr(users_columns).distinct()
    
    # write users table to parquet files
    print('Writing to users table')
    
    user_data_path = os.path.join(output_data, "users/")    
    users_table.write.parquet(user_data_path, mode="overwrite")
    

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x/1000).strftime('%Y-%m-%d %H:%M:%S'))
    df = df.withColumn('timestamp', get_timestamp('ts'))

    #df = df.withColumn('start_time', get_datetime('ts'))
    
    # extract columns to create time table
    print('Creating Time Table')
    time_table = df.select(
        col('timestamp').alias('start_time'),
        hour('timestamp').alias('hour'),
        dayofmonth('timestamp').alias('day'),
        weekofyear('timestamp').alias('week'),
        month('timestamp').alias('month'),
        year('timestamp').alias('year'),
        date_format('timestamp', 'u').alias('weekday')).orderBy("start_time").drop_duplicates() 
                                        
                            
    # write time table to parquet files partitioned by year and month
    print('Writing to time table')
    
    time_data_path = os.path.join(output_data, "time/")    
    time_table.write.partitionBy("year", "month").parquet(time_data_path, mode="overwrite")
   

    # read in song data to use for songplays table
    print('Reading Songs Table')
    #song_df = spark.read.parquet(output_data + 'songs/*/*/*.parquet')
    
    song_df = spark.read.json(os.path.join(input_data, 
                                           'song_data/*/*/*/*.json')).selectExpr("song_id",
                                                                                 "title", 
                                                                                 "artist_id", 
                                                                                 "artist_name", 
                                                                                 "year", 
                                                                                 "duration").drop_duplicates()
    
    '''
    print('Creating Songs Play Table')
    songs_logs=df.join(song_df, (df.song == song_df.title))
    songplays_table = songs_logs.join(time_table, 
                                      songs_logs.timestamp == time_table.start_time)\
                                      .drop(songs_logs.year)\
                                      .drop(songs_logs.start_time)\
                                      .withColumn("songplay_id", monotonically_increasing_id())
    '''
    songplays_table = df.join(
        song_df,
        (df.song == song_df.title) &
        (df.artist == song_df.artist_name) &
        (df.length == song_df.duration) &
        (year(df.timestamp) == song_df.year),
        'left_outer'
    ).select(
        df.timestamp.alias("start_time"),
        df.user_id,
        df.level,
        song_df.song_id,
        song_df.artist_id,
        df.session_id,
        df.location,
        df.user_agent,
        year(df.timestamp).alias('year'),
        month(df.timestamp).alias('month')
    ).orderBy("start_time", "user_id").withColumn("songplay_id", F.monotonically_increasing_id())


    # extract columns from joined song and log datasets to create songplays table 
    songplays_table_col = ['songplay_id', 'start_time', 'user_id', 'level', 'song_id', 
                           'artist_id', 'session_id', 'location', 'user_agent', 'year', 'month']
    
    songplays_table = songplays_table.select(songplays_table_col).repartition("year", "month")

    # write songplays table to parquet files partitioned by year and month
    print('Writing to songs plays table')
    
    songplay_data_path = os.path.join(output_data, "songplay/")    
    songplays_table.write.partitionBy("year", "month").parquet(songplay_data_path, mode="overwrite")

    print('Done')
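
As a hedged alternative to the Python udf conversion used above (assuming ts holds epoch milliseconds, as in the log files), the same timestamp column can be built with Spark's built-in functions, which avoids serializing every row through Python:

from pyspark.sql import functions as F

# ts is epoch milliseconds; dividing by 1000 gives seconds, which to_timestamp accepts
df = df.withColumn('timestamp', F.to_timestamp(F.col('ts') / 1000))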
Example No. 19
def process_log_data(spark, input_data, output_data):
    '''
    Function to read 'log-data' from S3 or local directory, extract 'users', 'time' and 'songplays' tables and load them as parquet files in S3. Each table has its own directory on S3.
    Parameters:
        - spark: Spark session object.
        - input_data: Path to base input data, can be local or S3.
        - output_data: Path to base output data at S3.
    Outputs:
        None
    '''
    # get filepath to log data file
    log_data = input_data + 'log-data'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df[df['page'] == 'NextSong']

    # extract columns for users table
    users_table = df.select(df.userId.alias('user_id'), \
                            df.firstName.alias('first_name'), \
                            df.lastName.alias('last_name'), \
                            df.gender.alias('gender'), \
                            df.level.alias('level')).dropDuplicates()

    # write users table to parquet files
    users_table.write.mode('append').parquet(output_data + '/users/')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda d: datetime.fromtimestamp(d / 1000.0),
                        TimestampType())
    df = df.withColumn('timestamp', get_timestamp('ts'))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda d: datetime.fromtimestamp(d / 1000.0),
                       DateType())
    df = df.withColumn('date', get_datetime('ts'))

    # extract columns to create time table
    time_table = df.select(df.ts.alias('start_time'), \
                           hour(df.timestamp).alias('hour'), \
                           dayofmonth(df.timestamp).alias('day'), \
                           weekofyear(df.timestamp).alias('week'), \
                           month(df.timestamp).alias('month'), \
                           year(df.timestamp).alias('year'), \
                           dayofweek(df.timestamp).alias('weekday')).dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy(
        ['year', 'month']).mode('append').parquet(output_data + '/time/')

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + '/songs/')
    # extract columns from joined song and log datasets to create songplays table
    # songplays_table: songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent
    songplays_table_join = df.join(song_df, (df.song == song_df.title) & \
                                            (df.length == song_df.duration)).dropDuplicates()

    songplays_table = songplays_table_join.select(songplays_table_join.ts.alias('start_time'), \
                                                  songplays_table_join.userId.alias('user_id'), \
                                                  songplays_table_join.level.alias('level'), \
                                                  songplays_table_join.song_id.alias('song_id'), \
                                                  songplays_table_join.artist_id.alias('artist_id'), \
                                                  songplays_table_join.sessionId.alias('session_id'), \
                                                  songplays_table_join.location.alias('location'), \
                                                  songplays_table_join.userAgent.alias('user_agent'), \
                                                  year(songplays_table_join.timestamp).alias('year'), \
                                                  month(songplays_table_join.timestamp).alias('month')).dropDuplicates()

    songplays_table = songplays_table.withColumn('songplay_id', monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy(
        ['year', 'month']).mode('append').parquet(output_data + '/songplays/')
Example No. 20
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = os.path.join(input_data, 'log_data/*/*/*.json')

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where("page = 'NextSong'")

    # extract columns for users table
    users_table = df.selectExpr(
        "userId as user_id", "firstName as first_name",
        "lastName as last_name", "gender",
        "level").filter("user_id <> ''").orderBy("user_id").drop_duplicates()

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users'))

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime(
        '%Y-%m-%d %H:%M:%S'))
    df = df.withColumn('timestamp', get_timestamp('ts'))

    # create datetime column from original timestamp column
    get_datetime = udf(
        lambda x: datetime.fromtimestamp(x / 1000).strftime('%Y-%m-%d'))
    df = df.withColumn('datetime', get_datetime('ts'))

    # extract columns to create time table
    time_table = df.select(
        col('timestamp').alias('start_time'),
        hour('timestamp').alias('hour'),
        dayofmonth('timestamp').alias('day'),
        weekofyear('timestamp').alias('week'),
        month('timestamp').alias('month'),
        year('timestamp').alias('year'),
        date_format(
            'timestamp',
            'u').alias('weekday')).orderBy("start_time").drop_duplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time'))

    # read in song data to use for songplays table
    song_df = spark.read.json(
        os.path.join(input_data, 'song_data/*/*/*/*.json')).selectExpr(
            "song_id", "title", "artist_id", "artist_name", "year",
            "duration").drop_duplicates()

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(
        song_df,
        (df.song == song_df.title) & (df.artist == song_df.artist_name) &
        (df.length == song_df.duration) & (year(df.timestamp) == song_df.year),
        'left_outer').select(df.timestamp.alias("start_time"),
                             df.userId.alias("user_id"), df.level,
                             song_df.song_id, song_df.artist_id,
                             df.sessionId.alias("session_id"), df.location,
                             df.userAgent.alias("user_agent"),
                             year(df.timestamp).alias('year'),
                             month(df.timestamp).alias('month')).orderBy(
                                 "start_time", "user_id").withColumn(
                                     "songplay_id",
                                     F.monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'songplays'))
Example No. 21
def process_log_data(spark, input_data, output_data):
    """
    Function: Extract data from log_data files for the users and time tables. From both log_data and song_data files, build the songplays table. Data is written into parquet files and loaded into an S3 bucket.
    
    Parameter list
    spark:        session, the Spark session that has been created.
    input_data:   string of path, a path pointing to the S3 bucket.
    output_data:  string of path, a path pointing to the destination in S3.
    
    """
    # get filepath to log data file
    log_data = input_data + 'log_data/*.json'

    # read log data file
    df = spark.read.json(log_data)
    
    # filter by actions for song plays
    df_actions = df.filter(df.page == 'NextSong')\
                   .select('ts', 'userId', 'level', 'song', 'artist', 'sessionId', 'length', 'location', 'userAgent')

    # extract columns for users table    
    users_table = df.select('userId', 'firstName', 'lastName', 'gender', 'level').dropDuplicates()
    
    users_table.createOrReplaceTempView('users')
    
    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users/users.parquet'), 'overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: str(int(int(x) / 1000)))
    df_actions = df_actions.withColumn('timestamp', get_timestamp(df_actions.ts)) 
    
    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x)/1000)))
    df_actions = df_actions.withColumn('datetime', get_datetime(df_actions.ts)) 
    
    # extract columns to create time table
    time_table = df_actions.select('datetime')\
                           .withColumn('start_time', df_actions.datetime)\
                           .withColumn('hour', hour('datetime'))\
                           .withColumn('day', dayofmonth('datetime'))\
                           .withColumn('week', weekofyear('datetime'))\
                           .withColumn('month', month('datetime'))\
                           .withColumn('year', year('datetime'))\
                           .withColumn('weekday', dayofweek('datetime'))\
                           .dropDuplicates()
    
    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month')\
                    .parquet(os.path.join(output_data, 'time/time.parquet'), 'overwrite')

    # read in song data to use for songplays table
    df_songs = spark.read.json(input_data + 'song_data/*/*/*/*.json')

    # extract columns from joined song and log datasets to create songplays table 
    df_song_actions = df_actions.join(df_songs, (df_actions['artist'] == df_songs['artist_name'])
                                  & (df_actions['song'] == df_songs['title'])
                                  & (df_actions['length'] == df_songs['duration']),'inner')
    
    songplays_table = df_song_actions.select(
        col('datetime').alias('start_time'),
        col('userId').alias('user_id'),
        col('level').alias('level'),
        col('song_id').alias('song_id'),
        col('artist_id').alias('artist_id'),
        col('sessionId').alias('session_id'),
        col('location').alias('location'),
        col('userAgent').alias('user_agent'),
        year('datetime').alias('year'),
        month('datetime').alias('month')
    ).withColumn('songplay_id', monotonically_increasing_id())
    

    # write songplays table to parquet files partitioned by year and month
    songplays_table.createOrReplaceTempView('songplays')
    
    time_table = time_table.alias('timetable')
    
    songplays_table.write.partitionBy(
        'year', 'month'
    ).parquet(os.path.join(output_data, 'songplays/songplays.parquet'), 'overwrite')
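
The temp views registered above ('users' and 'songplays') are not queried in this snippet; a minimal usage sketch, assuming the same SparkSession, could sanity-check the fact table with Spark SQL before writing it out:

# count songplays per subscription level from the registered temp view
plays_per_level = spark.sql("""
    SELECT level, COUNT(*) AS plays
    FROM songplays
    GROUP BY level
""")
plays_per_level.show()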
Example No. 22
def process_log_data(spark, input_data, output_data):
    """Obtains log data from json files and processes 
    them using spark, writing data to users_table, 
    time_table, and songplays_table parquet files
    
    Keyword arguments
    spark - spark connection
    input_data - path to read data from
    output_data - path to write parquet files to
    """
    # get filepath to log data file
    log_data = input_data + "/log_data/2018/11/*\.json"

    # read log data file
    print("Reading log_data in spark dataframe")
    df = spark.read.json(log_data)
    df.printSchema()
    print(df.count())

    # filter by actions for song plays
    print("Filter on NextSong")
    df = df.filter(df["page"] == "NextSong")
    print(df.count())

    # extract columns for users table
    print("extract columns to create users table")
    users_table = df.select(
        ["userId", "firstName", "lastName", "gender",
         "level"]).dropDuplicates()
    users_table.printSchema()
    print(users_table.count())

    # write users table to parquet files
    users_table_path = output_data + "/users_table"
    delete_table_if_exists(users_table_path)

    print("writing users_table")
    users_table.write.parquet(users_table_path)

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda ts: datetime.fromtimestamp(ts / 1000).strftime(
        '%Y-%m-%d %H:%M:%S'))
    df = df.withColumn("timestamp", get_timestamp(df.ts))
    timestamp_df = df.select(["timestamp"]).dropDuplicates()
    timestamp_df.printSchema()
    print(timestamp_df.count())

    # extract columns to create time table
    time_table = timestamp_df.select(
        date_format("timestamp", "yyyy-MM-dd HH:mm:ss").alias("start_time"),
        hour("timestamp").alias("hour"),
        dayofmonth("timestamp").alias("day"),
        weekofyear("timestamp").alias("week"),
        month("timestamp").alias("month"),
        year("timestamp").alias("year"),
        dayofweek("timestamp").alias("weekday"))

    time_table.printSchema()
    print(time_table.count())

    # write time table to parquet files partitioned by year and month
    time_table_path = output_data + "/time_table"
    delete_table_if_exists(time_table_path)

    print("writing time_table")
    time_table.write.partitionBy("year", "month").parquet(time_table_path)

    # read in song data to use for songplays table
    song_table = spark.read.parquet(output_data + "/songs_table")
    artists_table = spark.read.parquet(output_data + "/artists_table")
    song_df = song_table.join(
        artists_table, song_table.artist_id == artists_table.artist_id).drop(
            song_table.artist_id)
    print("the join artist and song table")
    song_df.printSchema()
    print(song_df.count())

    # extract columns from joined song and log datasets to create songplays table
    df_join_song_df = df.join(
        song_df,
        (df.song == song_df.title) & (df.artist == song_df.artist_name) &
        (df.length == song_df.duration)).dropDuplicates()

    df_join_song_df = df_join_song_df.withColumn("songplay_id",
                                                 monotonically_increasing_id())
    df_join_song_df.printSchema()
    print(df_join_song_df.count())

    songplays_table = df_join_song_df.select(
        col("songplay_id"),
        date_format("timestamp", "yyyy-MM-dd HH:mm:ss").alias("start_time"),
        col("userId"), col("level"), col("song_id"), col("artist_id"),
        col("sessionId"), col("artist_location"), col("userAgent"))

    songplays_table.printSchema()

    # write songplays table to parquet files partitioned by year and month
    songplays_table_path = output_data + "/songplays_table"
    delete_table_if_exists(songplays_table_path)

    print("writing songplays_table")
    songplays_table.withColumn("year", year(
        songplays_table.start_time)).withColumn(
            "month", month(songplays_table.start_time)).write.partitionBy(
                "year", "month").parquet(songplays_table_path)
Example No. 23
def process_log_data(spark, input_data, output_data):
    """ 
     This method is in charge of performing the ETL and creating the parquet tables related to the log_data; these tables are: users, time and songplays.
     
     parameters:
         spark = the spark session.
         input_data = the source root path of the data that will be processed
         output_data = the destination root path where the parquet tables will be created.
     
    """

    # get filepath to log data file
    log_data = input_data + "/log_data/*"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = filtered_log_data = df.where("page='NextSong'")

    # extract columns for users table
    users_columns = [
        "userId as user_id", "firstName as first_name",
        "lastName as last_name", "gender", "level"
    ]
    users_table = df.orderBy(desc("ts")).selectExpr(users_columns).distinct()

    # write users table to parquet files
    users_table.write.parquet(output_data + "users.parquet", mode="ignore")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda epoch_in_millis: int(epoch_in_millis / 1000),
                        LongType())
    df = df.withColumn("timestamp", get_timestamp("ts"))

    # create datetime column from original timestamp column
    get_datetime = udf(
        lambda epoch_in_seconds: datetime.fromtimestamp(epoch_in_seconds),
        TimestampType())
    df = df.withColumn("datetime", get_datetime("timestamp"))

    # extract columns to create time table
    time_columns = [
        col("datetime").alias("start_time"),
        hour("datetime").alias("hour"),
        dayofmonth("datetime").alias("day"),
        weekofyear("datetime").alias("week"),
        month("datetime").alias("month"),
        year("datetime").alias("year")
    ]
    time_table = df.select(time_columns).distinct()

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(output_data + "time.parquet",
                             mode="ignore",
                             partitionBy=["year", "month"])

    # read in song data to use for songplays table
    song_df = spark.read.json(input_data + "/song_data/*/*/*")

    # extract columns from joined song and log datasets to create songplays table
    song_plays_columns = [
        monotonically_increasing_id().alias('songplay_id'),
        get_datetime(get_timestamp("ts")).alias("start_time"),
        col("userId").alias("user_id"), "level", "song_id", "artist_id",
        col("sessionId").alias("session_id"), "location",
        col("userAgent").alias("user_agent")
    ]
    songplays_table = df.join(song_df, df.song == song_df.title,
                              "inner").select(song_plays_columns)

    # write songplays table to parquet files partitioned by artist_id
    songplays_table.write.parquet(output_data + "songplays.parquet",
                                  mode="ignore",
                                  partitionBy="artist_id")
Example No. 24
def process_log_data(spark, input_data, output_data):
    """This function will ingest the data from the log-data path. Also will create the parquets files for the users table and the time table. In the case of the songplays table, it will be joined with the data from the song_path."""

    # get filepath to log data file
    log_data = input_data + "log-data"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(col("page") == 'NextSong').filter(df.userId.isNotNull())

    # extract columns for users table
    users_table = df.select(
        col("userId").alias("user_id"),
        col("firstName").alias("first_name"),
        col("lastName").alias("last_name"), "gender",
        "level").dropDuplicates()

    # write users table to parquet files
    users_table.write.mode("overwrite").parquet(output_data + "users")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: str(int(int(x) / 1000)))
    df = df.withColumn("timestamp", get_timestamp(col("ts")))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000.0)))
    df = df.withColumn("datetime", get_datetime(col("ts")))

    # extract columns to create time table
    time_table = df.select('timestamp',
                           hour('datetime').alias('hour'),
                           dayofmonth('datetime').alias('day'),
                           weekofyear('datetime').alias('week'),
                           month('datetime').alias('month'),
                           year('datetime').alias('year'),
                           date_format('datetime', 'E').alias('weekday'))

    # write time table to parquet files partitioned by year and month
    time_table.write.mode("overwrite").partitionBy(
        "year", "month").parquet(output_data + "time")

    # read in song data to use for songplays table
    song_data = input_data + "song_data/*/*/*/*.json"
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    tsFormat = "yyyy/MM/dd HH:MM:ss z"
    songplays_table = song_df.join(
        df, song_df.artist_name == df.artist).withColumn(
            "songplay_id", monotonically_increasing_id()).withColumn(
                'start_time',
                to_timestamp(
                    date_format(
                        (col("ts") / 1000).cast(dataType=TimestampType()),
                        tsFormat),
                    tsFormat)).select("songplay_id", "start_time",
                                      col("userId").alias("user_id"), "level",
                                      "song_id", "artist_id",
                                      col("sessionId").alias("session_id"),
                                      col("artist_location").alias("location"),
                                      "userAgent",
                                      month(col("start_time")).alias("month"),
                                      year(col("start_time")).alias("year"))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode("overwrite").partitionBy(
        "year", "month").parquet(output_data + "songplays")
Example No. 25
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = os.path.join(input_data, 'log_data/*/*/*.json')

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    users_table = df['userId', 'firstName', 'lastName', 'gender',
                     'level'].dropDuplicates(['userId'])

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users.parquet'),
                              'overwrite')
    print("users.parquet completed")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: str(int(int(x) / 1000)))
    df = df.withColumn('timestamp', get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000.0)))
    df = df.withColumn("datetime", get_datetime(df.ts))

    # extract columns to create time table
    time_table = df.select(
        col('datetime').alias('start_time'),
        hour('datetime').alias('hour'),
        dayofmonth('datetime').alias('day'),
        weekofyear('datetime').alias('week'),
        month('datetime').alias('month'),
        year('datetime').alias('year'))
    time_table = time_table.dropDuplicates(['start_time'])

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time.parquet'), 'overwrite')
    print("time.parquet completed")

    # read in song data to use for songplays table
    song_data = os.path.join(input_data, "song-data/A/A/A/*.json")
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    df = df.join(song_df, song_df.title == df.song)

    songplays_table = df.select(
        col('ts').alias('start_time'),
        col('userId').alias('user_id'),
        col('level').alias('level'),
        col('song_id').alias('song_id'),
        col('artist_id').alias('artist_id'),
        col('sessionId').alias('session_id'),
        col('location').alias('location'),
        col('userAgent').alias('user_agent'),
        year('datetime').alias('year'),
        month('datetime').alias('month'))

    songplays_table = songplays_table.withColumn(
        'songplay_id', monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'songplays.parquet'), 'overwrite')
    print("songplays.parquet completed")
    print("process_log_data completed")
Example No. 26
def main():

    # args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME'])

    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    # job = Job(glueContext)
    # job.init(args['JOB_NAME'], args)

    # create time_begin as the current date + 1 day
    # time_begin = datetime.date.today() + datetime.timedelta(days=1)
    #20190908
    time_begin = datetime.date(2019, 1, 1)

    # create time_end as the current date + 1 month - 1 day
    # time_end = time_begin + relativedelta(months=1) - datetime.timedelta(days=1)

    time_end = datetime.date(2020, 2, 28)

    # create a dataframe from time_begin and time_end
    data = [(time_begin, time_end)]
    df = spark.createDataFrame(data, ["minDate", "maxDate"])
    # convert the data types and rename the fields
    df = df.select(df.minDate.cast(DateType()).alias("minDate"), df.maxDate.cast(DateType()).alias("maxDate"))

    # loop to generate every date between minDate and maxDate
    df = df.withColumn("daysDiff", f.datediff("maxDate", "minDate")) \
        .withColumn("repeat", f.expr("split(repeat(',', daysDiff), ',')")) \
        .select("*", f.posexplode("repeat").alias("date", "val")) \
        .withColumn("date", f.expr("to_date(date_add(minDate, date))")) \
        .select('date')

    # derive the day/month/year attributes from the date
    df = df.withColumn('id', date_format(df.date, "yyyyMMdd")) \
        .withColumn('ngay_trong_thang', f.dayofmonth(df.date)) \
        .withColumn('ngay_trong_tuan', from_unixtime(unix_timestamp(df.date, "yyyy-MM-dd"), "EEEEE")) \
        .withColumn('tuan_trong_nam', f.weekofyear(df.date)) \
        .withColumn('thang', f.month(df.date)) \
        .withColumn('quy', f.quarter(df.date)) \
        .withColumn('nam', f.year(df.date))
    df = df.withColumn('tuan_trong_thang', (df.ngay_trong_thang - 1) / 7 + 1)

    data_time = DynamicFrame.fromDF(df, glueContext, 'data_time')

    # convert data
    data_time = data_time.resolveChoice(specs=[('tuan_trong_thang', 'cast:int')])

    # select the fields and data types to load into the database
    applymapping1 = ApplyMapping.apply(frame=data_time,
                                       mappings=[("id", "string", "id", "bigint"),
                                                 ("ngay_trong_thang", 'int', 'ngay_trong_thang', 'int'),
                                                 ("ngay_trong_tuan", "string", "ngay_trong_tuan", "string"),
                                                 ("tuan_trong_thang", "int", "tuan_trong_thang", "int"),
                                                 ("tuan_trong_nam", "int", "tuan_trong_nam", "int"),
                                                 ("thang", "int", "thang", "int"),
                                                 ("quy", "int", "quy", "int"),
                                                 ("nam", "int", "nam", "int"),
                                                 ("date", "date", "ngay", "timestamp")])

    resolvechoice2 = ResolveChoice.apply(frame=applymapping1, choice="make_cols", transformation_ctx="resolvechoice2")
    dropnullfields3 = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields3")

    # write the data to the database
    datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields3,
                                                               catalog_connection="glue_redshift",
                                                               connection_options={"dbtable": "student.time_dim",
                                                                                   "database": "student_native_report"},
                                                               redshift_tmp_dir="s3n://dts-odin/temp/tu-hoc/hwb/fdfdf",
                                                               transformation_ctx="datasink4")
Example No. 27
def process_log_data(spark, input_data, output_data):
    """
    This function extracts the log_data json files, transforms them, and saves the results as parquet files to the filepath provided as an argument.
    To join tables across the two datasets (song_data and log_data), it reads back the songs table produced by the process_song_data function.
    
    INPUTS:
    * spark - the spark session to run this function on
    * input_data - the filepath to the log_data, which resides in an S3 bucket
    * output_data - the filepath to the transformed data, which gets stored in another S3 bucket
    """
    # get filepath to log data file
    log_data = os.path.join(input_data, 'log_data/*/*/*.json')

    # read log data file
    log_df = spark.read.json(log_data)

    # filter by actions for song plays
    log_df = log_df.filter(log_df.page == "NextSong")

    # extract columns for users table
    users_table = log_df.select(
        ['userId', 'firstName', 'lastName', 'gender', 'level'])

    # write users table to parquet files
    users_table.write.mode("overwrite").parquet(output_data + "users")

    # create a seconds-based column from the original millisecond timestamp column
    # (DoubleType comes from pyspark.sql.types; a plain float cannot be returned from a TimestampType udf)
    get_timestamp = udf(lambda x: x / 1000.0, DoubleType())
    log_df = log_df.withColumn("timestamp", get_timestamp(log_df.ts))

    # create datetime column from the seconds column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x), TimestampType())
    log_df = log_df.withColumn("start_time", get_datetime(log_df.timestamp))

    # extract columns to create time table
    time_table = log_df.withColumn("hour", hour("start_time")) \
        .withColumn("day", dayofmonth("start_time")) \
        .withColumn("week", weekofyear("start_time")) \
        .withColumn("month", month("start_time")) \
        .withColumn("year", year("start_time")) \
        .withColumn("weekday", dayofweek("start_time"))\
    .select("ts","start_time","hour", "day", "week", "month", "year", "weekday").drop_duplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.mode("overwrite").partitionBy(
        "year", "month").parquet(output_data + "time")

    # read in song data to use for songplays table
    song_df = os.path.join(output_data, 'songs/')
    song_data = spark.read.parquet(song_df)

    # extract columns from joined song and log datasets to create songplays table
    joined_table = log_df.join(song_data,
                               log_df.song == song_data.title,
                               how='inner')

    songplays_table = joined_table.select("start_time",col("userId").alias("user_id"),"level","song_id","artist_id",col("sessionId").alias("session_id"),"location",col("userAgent").alias("user_agent"))\
                                   .withColumn("songplay_id", monotonically_increasing_id())

    songplays_table = songplays_table.join(time_table, songplays_table.start_time == time_table.start_time, how="inner")\
                                      .select("songplay_id", songplays_table.start_time,"user_id", "level", "song_id", "artist_id", "session_id", "location", "user_agent", "year", "month")

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode("overwrite").partitionBy(
        "year", "month").parquet(output_data + "songplays")
Example No. 28
def process_log_data(spark, input_data, output_data):
    """Function to read raw data, about logs and songs, in json format from S3 putting
    the data in the right format (indicated schema).
    Then, from this first table, the dimensions tables users_table
    and time_table as well as the fact table songplays_table are created 
    and saved into the specified local folder.

    Args:
        spark: SparkSession to handle data using Spark;
        input_data: general path where the data resides locally;
        output_data: folder's path where the new tables will be stored.
    """

    # get filepath to log data file
    log_data = os.path.join(input_data, 'log_data')

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(df.page == 'NextSong')
    # extract columns for users table
    columns_users = [
        'userId as user_id', 'firstName as first_name',
        'lastName as last_name', 'gender', 'level'
    ]

    # create users_table and drop duplicated rows
    users_table = df.selectExpr(columns_users).dropDuplicates()

    # write users table to csv files
    users_table.write.csv(os.path.join(output_data, 'users_table'),
                          mode='overwrite',
                          header=True)

    # create timestamp column from original timestamp column
    df = df.withColumn('start_time', F.to_timestamp(df.ts / 1000))

    # extract columns to create time table
    columns_time = [
        'start_time',
        F.hour('start_time').alias('hour'),
        F.dayofmonth('start_time').alias('day'),
        F.weekofyear('start_time').alias('week'),
        F.month('start_time').alias('month'),
        F.year('start_time').alias('year'),
        F.dayofweek('start_time').alias('weekday')
    ]

    # create time table and drop duplicated rows
    time_table = df.select(columns_time).dropDuplicates()

    # write time table to csv files
    time_table.write.csv(os.path.join(output_data, 'time_table'),
                         mode='overwrite',
                         header=True)

    # read in song data to use for songplays table
    song_data = os.path.join(input_data, 'song_data')
    schema = StructType([
        StructField('num_songs', IntegerType()),
        StructField('artist_id', StringType()),
        StructField('artist_latitude', FloatType()),
        StructField('artist_longitude', FloatType()),
        StructField('artist_location', StringType()),
        StructField('artist_name', StringType()),
        StructField('song_id', StringType()),
        StructField('title', StringType()),
        StructField('duration', DoubleType()),
        StructField('year', IntegerType())
    ])
    song_df = spark.read.option("recursiveFileLookup",
                                "true").json(song_data, schema=schema)

    # extract columns from joined song and log datasets to create songplays table
    columns_songplay = [
        'start_time', 'userId as user_id', 'level', 'song_id', 'artist_id',
        'sessionId as session_id', 'location', 'userAgent as user_agent'
    ]

    condition = [
        df.song == song_df.title, df.length == song_df.duration,
        df.artist == song_df.artist_name
    ]
    songplays_table = df.join(song_df, on=condition,
                              how='left_outer').selectExpr(columns_songplay)

    # create songplay_id column
    songplays_table = songplays_table.withColumn(
        'songplay_id',
        F.row_number().over(Window.orderBy(F.monotonically_increasing_id())))

    # write songplays table to csv files
    songplays_table.write.csv(os.path.join(output_data, 'songplays_table'),
                              mode='overwrite',
                              header=True)
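
One caveat on the songplay_id step above: a Window ordered by monotonically_increasing_id() with no partitionBy pulls every row into a single partition. If globally consecutive ids are not required, the id column can be added directly (a hedged alternative, not the original author's choice):

# unique (but not consecutive) ids without a single-partition window shuffle
songplays_table = songplays_table.withColumn('songplay_id',
                                             F.monotonically_increasing_id())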
Example No. 29
def process_log_data(spark, input_data, output_data):
    """
    Process the log data from S3 storage and create the analytical tables: the users table, time table, and songplays table.
    
    This function reads the data in json files from S3 storage, transforms the data into the analytical tables
    (users, time, and songplays), and writes it into partitioned parquet files on S3.
    
    Args:
        spark: the spark session
        input_data: the S3 bucket to read data from
        output_data: the S3 bucket to write analytics tables to
    """

    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"

    # define the log data schema
    log_data_schema = R([
        Fld("artist", Str(), True),
        Fld("auth", Str(), False),
        Fld("firstName", Str(), True),
        Fld("gender", Str(), True),
        Fld("itemInSession", Int(), False),
        Fld("lastName", Str(), True),
        Fld("length", Dbl(), True),
        Fld("level", Str(), False),
        Fld("location", Str(), True),
        Fld("method", Str(), False),
        Fld("page", Str(), False),
        Fld("registration", Dbl(), True),
        Fld("sessionId", Int(), False),
        Fld("song", Str(), True),
        Fld("status", Int(), False),
        Fld("ts", Dbl(), False),
        Fld("userAgent", Str(), True),
        Fld("userId", Str(), True)
    ])

    # read log data file
    df = spark.read.json(log_data, schema=log_data_schema)

    # filter by actions for song plays
    df = df.filter(col("page") == "NextSong")

    # extract columns for users table
    users_table = (df.filter((col("userID") != "")
                             & (col("userID").isNotNull())).select(
                                 col('userId').alias('user_id'),
                                 col('firstName').alias('first_name'),
                                 col('lastName').alias('last_name'), "gender",
                                 "level").distinct())

    # write users table to parquet files
    users_table.write.parquet(output_data + "users_table.parquet",
                              mode="overwrite")

    # create datetime column from original timestamp column
    df = df.withColumn('start_time', to_timestamp(df['ts'] / 1000))

    # extract columns to create time table
    time_table = (df.withColumn("hour", hour("start_time")).withColumn(
        "day", dayofmonth("start_time")).withColumn(
            "week", weekofyear("start_time")).withColumn(
                "month", month("start_time")).withColumn(
                    "year", year("start_time")).withColumn(
                        "weekday", dayofweek("start_time")).select(
                            "start_time", "hour", "day", "week", "month",
                            "year", "weekday").distinct())

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(output_data + "time_table.parquet",
                             mode="overwrite",
                             partitionBy=["year", "month"])

    # read in song and artist data to use for songplays table
    song_df = spark.read.parquet(output_data + "songs_table.parquet")
    artist_df = spark.read.parquet(output_data + "artists_table.parquet")

    song_df = (song_df.join(artist_df, "artist_id",
                            "full").select("song_id", "title", "artist_id",
                                           "name", "duration"))

    # join the song data with log data and save it in songplays_table variable
    songplays_table = df.join(song_df, [
        df.song == song_df.title, df.artist == song_df.name, df.length
        == song_df.duration
    ], "left")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = (songplays_table.withColumn(
        "songplay_id", monotonically_increasing_id()).select(
            "songplay_id",
            "start_time",
            col("userId").alias("user_id"),
            "level",
            "song_id",
            "artist_id",
            col("sessionId").alias("session_id"),
            "location",
            col("userAgent").alias("user_agent"),
        ).withColumn("year",
                     year("start_time")).withColumn("month",
                                                    month("start_time")))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(output_data + "songplays_table.parquet",
                                  mode="overwrite",
                                  partitionBy=["year", "month"])
Example No. 30
def process_log_data(spark, input_data, output_data):
    """
    reads JSON file from S3, transforms it,
    and outputs them back to S3 as partitioned parquet files (as fact and dimensional tables)

    :param spark: an existing SparkSession
    :param input_data: S3 bucket directory (e.g., "s3a://udacity-dend/")
    :param output_data: output directory (e.g., 'output/')
    """
    # get filepath to log data file
    log_data = input_data + 'log_data/2018/11/*.json'              # TEST subset
    # log_data = os.path.join(input_data, 'log_data/*/*/*.json')   # FINAL full dataset

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df['page'] == 'NextSong')

    # extract columns for users table
    # dim table: users
    users_table = df['userId', 'firstName', 'lastName', 'gender', 'level']
    users_table = users_table.dropDuplicates(['userId'])
    # or, shall I do style from here -- https://spark.apache.org/docs/latest/sql-getting-started.html

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users'))

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda ms: datetime.fromtimestamp(ms / 1000.0).
                        strftime('%Y-%m-%d %H:%M:%S'))
    df = df.withColumn('start_time', get_timestamp(df.ts))

    # create datetime column from original timestamp column
    # Note from mentor: You can ignore the get_datetime part as the timestamp creation is enough

    # extract columns to create time table
    # dim table: time
    # imported functions: year, month, dayofmonth, hour, weekofyear, date_format
    time_table = df.select(col('start_time'),
                           hour(df.start_time).alias('hour'),
                           dayofmonth(df.start_time).alias('dayofmonth'),
                           month(df.start_time).alias('month'),
                           year(df.start_time).alias('year'),
                           weekofyear(df.start_time).alias('weekofyear') \
                           # date_format(df.start_time).alias('date_format')  -- column not needed, right?

                 ).dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time'))

    # read in song data to use for songplays table
    song_df = spark.read.json(input_data + 'song_data/A/A/B/*.json')     # TEST subset
    # song_df = spark.read.json(input_data + 'song_data/*/*/*.json')     # FINAL full dataset

    # extract columns from joined song and log datasets to create songplays table
    # fact table: songplays
    #  LEFT: df, aka log datasets
    # RIGHT: song_df, aka song datasets, read from the raw song_data json files
    joint_df = df.join(song_df,
                       (df.artist == song_df.artist_name) &
                       (df.length == song_df.duration) &
                       (df.song == song_df.title), 'left_outer' \
               ).dropDuplicates()

    # extract columns from joint_df
    songplays_table = joint_df.select(col('start_time'),
                                      col('userId').alias('user_id'),
                                      df.level,
                                      song_df.song_id, song_df.artist_id,
                                      col('sessionId').alias('session_id'),
                                      df.location,
                                      col('userAgent').alias('user_agent'),
                                      year('start_time').alias('year'),
                                      month('start_time').alias('month') \
                      ).withColumn('songplay_id', monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'songplays'))
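
These process_log_data variants are typically driven by a small main() that builds the SparkSession and calls the song and log processing steps in order. A hypothetical driver sketch (the paths and the process_song_data call are assumptions, not part of the snippets above):

from pyspark.sql import SparkSession


def main():
    spark = SparkSession.builder.appName("sparkify-etl").getOrCreate()
    input_data = "s3a://udacity-dend/"   # or a local path
    output_data = "output/"

    # process_song_data(spark, input_data, output_data)  # assumed to write the songs/artists tables read above
    process_log_data(spark, input_data, output_data)

    spark.stop()


if __name__ == "__main__":
    main()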