Example #1
def parse_dates(df, format):
    """
    Parses the date into year, month and day columns
    :param df: input df
    :param format: the format of the timestamp
    :return: dataframe
    """
    return df.withColumn('parsed_date',
                         f.to_timestamp(f.col('transaction_date'), format)) \
        .withColumn("year", f.year(f.col('parsed_date'))) \
        .withColumn("month", f.month(f.col('parsed_date'))) \
        .withColumn("day", f.dayofmonth(f.col('parsed_date'))) \
        .withColumn("unix_ts", f.unix_timestamp('parsed_date')) \
        .drop("transaction_date")
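A minimal usage sketch for the helper above; it assumes a local SparkSession, `pyspark.sql.functions` imported as `f` (as the function requires), and a hypothetical DataFrame with a `transaction_date` string column:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.appName("parse_dates_demo").getOrCreate()

# hypothetical sample data; the column name must match what parse_dates expects
sample = spark.createDataFrame(
    [("2018-11-15 12:30:00",), ("2018-11-16 08:05:00",)],
    ["transaction_date"])

parsed = parse_dates(sample, "yyyy-MM-dd HH:mm:ss")
parsed.show(truncate=False)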
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = "s3a://udacity-dend/log_data/*/*/*.json"

    # read log data file
    df_log = spark.read.json(log_data)

    # filter by actions for song plays
    df_log = df_log.where(df_log.page == "NextSong")

    # extract columns for users table
    users_table = df_log.select(
        col("userId").alias("user_id"),
        col("firstName").alias("first_name"),
        col("lastName").alias("last_name"),
        "gender", "level").dropDuplicates()

    # write users table to parquet files
    users_table.write.format("parquet").save("s3a://udacity-dend/users")

    # create timestamp column from original timestamp column
    get_timestamp = udf(
        lambda x: datetime.fromtimestamp(x / 1000).strftime('%Y-%m-%d %H:%M:%S'))
    df_log = df_log.withColumn("timestamp", get_timestamp(df_log.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(
        lambda x: datetime.fromtimestamp(x / 1000).strftime('%Y-%m-%d'))
    df_log = df_log.withColumn("datetime", get_datetime(df_log.ts))

    # extract columns to create time table
    time_table = df_log.select(
        df_log.timestamp.alias('start_time'),
        hour(df_log.datetime).alias('hour'),
        dayofmonth(df_log.datetime).alias('day'),
        weekofyear(df_log.datetime).alias('week'),
        month(df_log.datetime).alias('month'),
        year(df_log.datetime).alias('year'),
        date_format(df_log.datetime, 'u').alias('weekday')).dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month") \
        .format("parquet").save("s3a://udacity-dend/time")

    # read in song data to use for songplays table
    df_song = spark.read.json("s3a://udacity-dend/song_data/*/*/*/*.json")

    # extract columns from joined song and log datasets to create songplays table
    cond = [df_song.title == df_log.song, df_song.artist_name == df_log.artist]
    df = df_log.join(df_song, cond, 'outer') \
        .withColumn("songplay_id", monotonically_increasing_id())
    songplays_table = df.select(
        df.songplay_id, df.timestamp.alias('start_time'),
        df.userId.alias('user_id'), df.level,
        df.song_id, df.artist_id, df.sessionId.alias('session_id'),
        df.location, df.userAgent.alias('user_agent'),
        year(df.datetime).alias('year'), month(df.datetime).alias('month'))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month") \
        .format("parquet").save("s3a://udacity-dend/songplays")
def process_log_data(spark, input_data, output_data):
    '''
    Process the log data from the S3 bucket into dataframes
    Arguments : spark        - spark session
                input_data   - log data location in the S3 bucket
                output_data  - location to store the output parquet files
    '''


    logdata_path = os.path.join(input_data, "log-data/*/*/*.json")

    df_logdata = spark.read.json(logdata_path)
    df_logdata.printSchema()
    df_logdata_filter = df_logdata.filter(df_logdata.page=='NextSong')

    #Process user table
    users_df = df_logdata_filter['userId', 'firstName', 'lastName', 'gender', 'level']
    users_df = users_df.dropDuplicates(['userId'])
    users_df.printSchema()
    users_df.show(5, truncate= False)

    users_output = output_data + "users.parquet"
    users_df.write.mode("overwrite").parquet(users_output)

    #Process time table
    #create user defined functions to convert 'ts'
    @udf(t.TimestampType())
    def get_timestamp(ts):
        return datetime.fromtimestamp(ts/1000.0)
    df_logdata_filter = df_logdata_filter.withColumn('timestamp', get_timestamp("ts"))

    @udf(t.StringType())
    def get_datetime(ts):
        return datetime.fromtimestamp(ts/1000.0).strftime('%Y-%m-%d %H:%M:%S')

    df_logdata_filter = df_logdata_filter.withColumn('datetime',get_datetime("ts"))

    df_logdata_filter.printSchema()
    df_logdata_filter.show(5, truncate=False)
    # extract columns to create time table
    time_df = df_logdata_filter.select(
                 col('datetime').alias('start_time')
                ,hour('datetime').alias('hour')
                ,dayofmonth('datetime').alias('day')
                ,weekofyear('datetime').alias('week')
                ,month('datetime').alias('month')
                ,year('datetime').alias('year')
               )
    time_df = time_df.dropDuplicates(['start_time'])
    time_df.printSchema()
    time_df.show(5, truncate=False)

    time_output = output_data + "time.parquet"
    time_df.write.mode("overwrite").partitionBy("year","month").parquet(time_output)

    songdata_path = os.path.join(input_data, "song-data/A/A/A/*.json")
    df_songdata = spark.read.json(songdata_path)

    df_logdata_filter = df_logdata_filter.join(df_songdata, df_songdata.title == df_logdata_filter.song )

    songplays_df = df_logdata_filter.select(
                 col('ts').alias('ts')
                ,col('userId').alias('user_id')
                ,col('level').alias('level')
                ,col('song_id').alias('song_id')
                ,col('artist_id').alias('artist_id')
                ,col('sessionId').alias('session_id')
                ,col('location').alias('location')
                ,col('userAgent').alias('user_agent')
                ,col('year').alias('year')
                ,month('datetime').alias('month'))

    songplays_df = songplays_df.withColumnRenamed('ts', 'start_time')
    songplays_df = songplays_df.withColumn('songplay_id', monotonically_increasing_id())

    songplays_output = output_data + "songplays.parquet"
    songplays_df.write.mode("overwrite").partitionBy("year","month").parquet(songplays_output)
Example #4
def process_log_data(spark, input_data, output_data):
    """
    Description:
            Process the event log file and extract data for table time, users and songplays from it.
    :param spark: a spark session instance
    :param input_data: input file path
    :param output_data: output file path
    """

    # get filepath to log data file
    log_data = input_data + "log-data/*"

    # read log data file
    df = spark.read.json(
        log_data,
        mode='PERMISSIVE',
        columnNameOfCorruptRecord='corrupt_record').drop_duplicates()

    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table
    users_table = df.select("userId", "firstName", "lastName", "gender",
                            "level").drop_duplicates()

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, "users/"),
                              mode="overwrite")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.utcfromtimestamp(int(x) / 1000),
                        TimestampType())
    df = df.withColumn("start_time", get_timestamp("ts"))

    # extract columns to create time table
    time_table = df.withColumn("hour",hour("start_time"))\
                    .withColumn("day",dayofmonth("start_time"))\
                    .withColumn("week",weekofyear("start_time"))\
                    .withColumn("month",month("start_time"))\
                    .withColumn("year",year("start_time"))\
                    .withColumn("weekday",dayofweek("start_time"))\
                    .select("ts","start_time","hour", "day", "week", "month", "year", "weekday").drop_duplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(os.path.join(output_data, "time_table/"),
                             mode='overwrite',
                             partitionBy=["year", "month"])

    # read in song data to use for songplays table
    song_df = spark.read\
                .format("parquet")\
                .option("basePath", os.path.join(output_data, "songs/"))\
                .load(os.path.join(output_data, "songs/*/*/"))

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, df.song == song_df.title, how='inner')\
                        .select(monotonically_increasing_id().alias("songplay_id"),col("start_time"),col("userId").alias("user_id"),"level","song_id","artist_id", col("sessionId").alias("session_id"), "location", col("userAgent").alias("user_agent"))

    songplays_table = songplays_table.join(time_table, songplays_table.start_time == time_table.start_time, how="inner")\
                        .select("songplay_id", songplays_table.start_time, "user_id", "level", "song_id", "artist_id", "session_id", "location", "user_agent", "year", "month")

    # write songplays table to parquet files partitioned by year and month
    songplays_table.drop_duplicates().write.parquet(
        os.path.join(output_data, "songplays/"),
        mode="overwrite",
        partitionBy=["year", "month"])
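A hedged sketch of how a driver might invoke the function above; the output bucket name is a placeholder, the input bucket follows the s3a://udacity-dend/ convention used elsewhere in these examples, and the songs/ parquet data is assumed to have been written beforehand by a companion song-processing step:
from pyspark.sql import SparkSession

# hypothetical driver; bucket names below are assumptions, not the author's values
spark = SparkSession.builder.appName("sparkify-etl").getOrCreate()
process_log_data(spark,
                 input_data="s3a://udacity-dend/",
                 output_data="s3a://my-output-bucket/")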
Example #5
def process_log_data(spark, input_data, output_data):
    """
    Process Log Data procedure
    The ETL process reads the log and song JSON files,
    extracts users, time and songplays data, and saves each as a *.parquet file
    to either the project workspace or an S3 bucket.  It also
    adds logging information to track the ETL process
    """

    loadTimes = []
    print('======= Read log data json files to dfLog dataframe =======')

    log_data = input_data + "log_data/2018/11"
    t0 = time()

    print('Path: ' + log_data)
    print('dfLog = spark.read.json(log_data)')

    dfLog = spark.read.json(log_data)
    cnt = dfLog.count()

    print('Total count of log data: ' + str(cnt))

    loadTime = time() - t0
    loadTimes.append(loadTime)
    print("=== DONE IN: {0:.2f} sec\n".format(loadTime))

    print('======= Read song data json files to dfSong dataframe =======')
    song_data = input_data + "song_data/*/*/*"
    dataSong = input_data + "/song_data/A/A/A/"

    print('Path: ' + song_data)
    print("Load schema a song file")
    print(
        "dfGetSampleSchema = spark.read.options(samplingRatio=0.1).json(dataSong).schema"
    )

    loadTimes = []
    t0 = time()

    dfGetSampleSchema = spark.read.options(
        samplingRatio=0.1).json(dataSong).schema
    songschema = dfGetSampleSchema

    print('dfSong = spark.read.json(song_data, schema=songschema) ')
    dfSong = spark.read.json(song_data, schema=songschema)

    loadTime = time() - t0
    loadTimes.append(loadTime)
    print("=== DONE IN: {0:.2f} sec\n".format(loadTime))

    # extract columns for users data and drop duplicate userId

    print('======= Users: Extract fields and drop duplicates data =======')
    print('dfLog.select("userId","firstName", "lastName", "gender", "level")')
    print('')
    loadTimes = []
    t0 = time()

    users_table = dfLog.select("userId", "firstName", "lastName", "gender",
                               "level")
    users_table = users_table.dropDuplicates(['userId'])

    loadTime = time() - t0
    loadTimes.append(loadTime)
    print("=== DONE IN: {0:.2f} sec\n".format(loadTime))

    # create users parquet file(s)

    print('======= Users: Create users parquet =======')
    print('users_table.write.mode(overwrite).parquet(users_parquet)')
    loadTimes = []
    t0 = time()

    users_parquet = output_data + "users.parquet"
    users_table.write.mode('overwrite').parquet(users_parquet)

    loadTime = time() - t0
    loadTimes.append(loadTime)
    print("=== DONE IN: {0:.2f} sec\n".format(loadTime))

    # create timestamp/datetime column and extract columns from original timestamp column

    print(
        '======= Time: Create Time table from ts column and drop duplicates data ======='
    )
    print(
        'time_table.withColumn(datetime, from_unixtime((time_table.ts/1000) .........'
    )

    loadTimes = []
    t0 = time()

    time_table = dfLog.select("ts")
    time_table = time_table.withColumn('datetime', from_unixtime((time_table.ts/1000),'yyyy-MM-dd HH:mm:ss.SSSS')) .\
                    withColumn('hour', hour('datetime')) .\
                    withColumn('day', dayofmonth('datetime')) .\
                    withColumn('week', weekofyear('datetime')) .\
                    withColumn('month', month('datetime')) .\
                    withColumn('year', year('datetime')) .\
                    withColumn('weekday', dayofweek('datetime')) .\
                    withColumnRenamed('ts','milliseconds') .\
                    withColumn('datetime', F.to_timestamp('datetime'))

    loadTime = time() - t0
    loadTimes.append(loadTime)
    print("=== DONE IN: {0:.2f} sec\n".format(loadTime))

    # Create Time parquet and partition by year and month

    print('======= Time: Create time parquet =======')
    print(
        'time_table.write.mode(overwrite).partitionBy("year","month").parquet(time_parquet)'
    )

    loadTimes = []
    t0 = time()

    time_parquet = output_data + "time.parquet"
    time_table.write.mode('overwrite').partitionBy(
        "year", "month").parquet(time_parquet)

    loadTime = time() - t0
    loadTimes.append(loadTime)
    print("=== DONE IN: {0:.2f} sec\n".format(loadTime))

    # extract columns from song and log json to create songplays

    print('======= SongPlays: Create SongPlays Time =======')
    print('Join logfiles and songfiles data to create the SongPlays dataset')
    print(
        'Create temp views to be used in sql statement to load songplays dataframe'
    )

    loadTimes = []
    t0 = time()

    #Create temp views to be used in songplays query needed to created parquet file

    print('>>> dfLog.createOrReplaceTempView(Log) ....')
    dfLog.createOrReplaceTempView("Log")

    print('>>> dfSong.createOrReplaceTempView(Songs) ....')
    dfSong.createOrReplaceTempView("Songs")

    print('>>> dfTimeTable.createOrReplaceTempView("Time") .....')
    time_table.createOrReplaceTempView("Time")

    print('Load dataframe songplays_table based on sql statement ')
    print('spark.sql(select t.year, t.month, datetime start_time,......)')

    #Use spark sql to create the necessary dataset to load songplays table/parquet
    songplays_table = spark.sql("""
                            select  
                                   t.year, t.month, datetime start_time, 
                                   userid, level, s.song_id, s.artist_id, 
                                   sessionId, location, userAgent
                                from Log l 
                                inner join Time t
                                    on l.ts = t.milliseconds
                                left join Songs s
                                    on  s.artist_name = l.artist
                                    and s.title = l.song
                                    and s.duration = l.length
                             """)
    loadTime = time() - t0
    loadTimes.append(loadTime)
    print("=== DONE IN: {0:.2f} sec\n".format(loadTime))

    #print("Remove duplicates")
    #print("songplays_table = songplays_table.dropDuplicates(['userid','level','song_id','artist_id','sessionId'])")
    #songplays_table = songplays_table.dropDuplicates(['userid','level','song_id','artist_id','sessionId'])

    print("Add unique index id name called songplays_id")
    print(
        "songplays_table.withColumn('songplays_id',monotonically_increasing_id() +1)"
    )

    songplays_table = songplays_table.withColumn(
        "songplays_id",
        monotonically_increasing_id() + 1)

    # write songplays table to parquet files partitioned by year and month

    print('======= SongPlays: Create SongPlays parquet =======')
    print(
        'songplays_table.write.mode(overwrite).partitionBy("year","month").parquet(songplays_parquet)'
    )

    loadTimes = []
    t0 = time()

    songplays_parquet = output_data + "songplays.parquet"
    songplays_table.write.mode('overwrite').partitionBy(
        "year", "month").parquet(songplays_parquet)

    loadTime = time() - t0
    loadTimes.append(loadTime)
    print("=== DONE IN: {0:.2f} sec\n".format(loadTime))
Example #6
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

glueContext = GlueContext(SparkContext.getOrCreate())

df = spark.read.format("com.databricks.spark.csv").schema(customSchema).option(
    "quote",
    '"').option("header",
                "true").option("delimiter",
                               ',').load('s3://fsi406-xetra-${user}/*/*.csv')

df1 = df.withColumn("Timestamp", to_timestamp(mergeCols(("Date"), ("Time"))))
df2 = df1.drop("Date", "Time")
df3 = df2.withColumn("Year", year("Timestamp")).withColumn(
    "Month", month("Timestamp")).withColumn("Day", dayofmonth("Timestamp"))

dynaframe = DynamicFrame.fromDF(df3, glueContext, "xetra")
glueContext.write_dynamic_frame.from_options(
    frame=dynaframe,
    connection_type="s3",
    connection_options={
        "path": "s3://fsi406-parquet-${user}/",
        "partitionKeys": ["year", "month", "day"],
        "mode": "overwrite"
    },
    format="parquet")
Example #7
def process_log_data(spark, input_data, output_data):
    """
    Load data from log_data dataset and extract columns
    for users and time tables, reads both the log_data and song_data
    datasets and extracts columns for songplays table with the data.
    It writes the data into parquet files which will be loaded on s3.
    Parameters
    ----------
    spark: session
          This is the spark session that has been created
    input_data: path
           This is the path to the log_data s3 bucket.
    output_data: path
            This is the path to where the parquet files will be written.
    """
    # get filepath to log data file
    log_data = input_data + 'log_data/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    actions_df = df.filter(df.page == 'NextSong') \
                   .select('ts', 'userId', 'level', 'song', 'artist',
                           'sessionId', 'location', 'userAgent')

    # extract columns for users table
    users_table = df.select('userId', 'firstName', 'lastName', 'gender',
                            'level').dropDuplicates()
    users_table.createOrReplaceTempView('users')
    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users/users.parquet'),
                              'overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: str(int(int(x) / 1000)))
    actions_df = actions_df.withColumn('timestamp',
                                       get_timestamp(actions_df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000)))
    actions_df = actions_df.withColumn('datetime', get_datetime(actions_df.ts))

    # extract columns to create time table
    time_table = actions_df.select('datetime') \
                           .withColumn('start_time', actions_df.datetime) \
                           .withColumn('hour', hour('datetime')) \
                           .withColumn('day', dayofmonth('datetime')) \
                           .withColumn('week', weekofyear('datetime')) \
                           .withColumn('month', month('datetime')) \
                           .withColumn('year', year('datetime')) \
                           .withColumn('weekday', dayofweek('datetime')) \
                           .dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month') \
                    .parquet(os.path.join(output_data,
                                          'time/time.parquet'), 'overwrite')

    # read in song data to use for songplays table
    song_df = spark.read.json(input_data + 'song_data/*/*/*/*.json')

    # extract columns from joined song and log datasets to create songplays table
    actions_df = actions_df.alias('log_df')
    song_df = song_df.alias('song_df')
    joined_df = actions_df.join(
        song_df,
        col('log_df.artist') == col('song_df.artist_name'), 'inner')
    songplays_table = joined_df.select(
        col('log_df.datetime').alias('start_time'),
        col('log_df.userId').alias('user_id'),
        col('log_df.level').alias('level'),
        col('song_df.song_id').alias('song_id'),
        col('song_df.artist_id').alias('artist_id'),
        col('log_df.sessionId').alias('session_id'),
        col('log_df.location').alias('location'),
        col('log_df.userAgent').alias('user_agent'),
        year('log_df.datetime').alias('year'),
        month('log_df.datetime').alias('month')) \
        .withColumn('songplay_id', monotonically_increasing_id())

    songplays_table.createOrReplaceTempView('songplays')
    # write songplays table to parquet files partitioned by year and month
    time_table = time_table.alias('timetable')

    songplays_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'songplays/songplays.parquet'), 'overwrite')

'''Now we drop the year, month, day, hour, minute, date and time columns, as we will recreate them from the timestamp column we created'''
df_nycflights = df_nycflights. \
                drop('year'). \
                drop('month'). \
                drop('day'). \
                drop('hour'). \
                drop('minute'). \
                drop('date'). \
                drop('time')

df_nycflights.show() 

'''Now we extract the fields back'''
df_nycflights = df_nycflights. \
                withColumn('year',year(df_nycflights.timestamp)). \
                withColumn('month',month(df_nycflights.timestamp)). \
                withColumn('day',dayofmonth(df_nycflights.timestamp)). \
                withColumn('hour',hour(df_nycflights.timestamp)). \
                withColumn('minute',minute(df_nycflights.timestamp))  

df_nycflights.show()

'''Now a few operations on the timestamp'''
df_nycflights = df_nycflights.\
                withColumn('date_sub',date_sub(df_nycflights.timestamp ,10)). \
                withColumn('date_add',date_add(df_nycflights.timestamp ,10)). \
                withColumn('months_between',months_between(df_nycflights.timestamp,df_nycflights.timestamp))

df_nycflights.show()                 
def process_log_data(spark, input_data, output_data):
    """ Processing log data (users, time table, songplay) by the JSON given by S3,
        after data normalization and transformation
        these data are wrote as parquet files """   
    
    """ Proving JSON structure to Spark """
    logdata_schema = StructType([
        StructField("artist", StringType(), True),
        StructField("auth", StringType(), True),
        StructField("firstName", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("itemInSession", LongType(), True),
        StructField("lastName", StringType(), True),
        StructField("length", DoubleType(), True),
        StructField("level", StringType(), True),
        StructField("location", StringType(), True),
        StructField("method", StringType(), True),
        StructField("page", StringType(), True),
        StructField("registration", DoubleType(), True),
        StructField("sessionId", LongType(), True),
        StructField("song", StringType(), True),
        StructField("status", LongType(), True),
        StructField("ts", LongType(), True),
        StructField("userAgent", StringType(), True),
        StructField("userId", StringType(), True),
    ])
        
    # get filepath to log data file
    log_data = input_data + 'log-data'

    # read log data file, JSON structure
    df = spark.read.json(log_data, schema = logdata_schema)
    
    # filter by actions for song plays
    df = df.filter(col("page") == 'NextSong')
    
    # extract columns for users table
    users_table = df.select(col("userId").alias("user_id"),col("firstName").alias("first_name"),
                            col("lastName").alias("last_name"),"gender","level")
    
    # write users table to parquet files
    users_table.write.parquet(output_data+"users")

    tsFormat = "yyyy-MM-dd HH:mm:ss z"
    # Converting ts to a timestamp format    
    time_table = df.withColumn('ts',
                               to_timestamp(date_format((df.ts 
                                                         /1000).cast(dataType=TimestampType()), tsFormat), tsFormat))

    # extract columns to create time table    
    time_table = time_table.select(col("ts").alias("start_time"),
                                   hour(col("ts")).alias("hour"),
                                   dayofmonth(col("ts")).alias("day"), 
                                   weekofyear(col("ts")).alias("week"), 
                                   month(col("ts")).alias("month"),
                                   year(col("ts")).alias("year"))

    
    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year","month").parquet(output_data+"time")

    # read in song data to use for songplays table
    song_data = input_data+"song-data/*/*/*/*.json"
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table 
    songplays_table = song_df.join(df, song_df.artist_name == df.artist) \
        .withColumn("songplay_id", monotonically_increasing_id()) \
        .withColumn('start_time',
                    to_timestamp(date_format((col("ts") / 1000).cast(dataType=TimestampType()),
                                             tsFormat), tsFormat)) \
        .select("songplay_id",
                "start_time",
                col("userId").alias("user_id"),
                "level",
                "song_id",
                "artist_id",
                col("sessionId").alias("session_id"),
                col("artist_location").alias("location"),
                "userAgent",
                month(col("start_time")).alias("month"),
                year(col("start_time")).alias("year"))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year","month").parquet(output_data+"songplays")
def process_log_data(spark, input_data, output_data):
    """ Function to process User, Time and Song Plays data from the json 
        files under the log-data folder and load it in parquet format on 
        a public S3 bucket. """
    # get filepath to log data file
    log_data = input_data + 'log-data/*/*/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(col("page") == 'NextSong')

    # extract columns for users table
    users_table = df.select(
        col("userId").alias("user_id"),
        col("firstName").alias("first_name"),
        col("lastName").alias("last_name"), "gender", "level")

    # write users table to parquet files
    users_table.write.parquet(output_data + "users")

    # define ts format
    tsFormat = "yyyy-MM-dd HH:mm:ss z"

    # convert ts to a timestamp format
    time_table = df.withColumn(
        'ts',
        to_timestamp(
            date_format((df.ts / 1000).cast(dataType=TimestampType()),
                        tsFormat), tsFormat))

    # extract columns to create time table
    time_table = time_table.select(
        col("ts").alias("start_time"),
        hour(col("ts")).alias("hour"),
        dayofmonth(col("ts")).alias("day"),
        weekofyear(col("ts")).alias("week"),
        month(col("ts")).alias("month"),
        year(col("ts")).alias("year"))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(output_data + "time")

    # read in song data to use for songplays table
    song_data = input_data + "song-data/*/*/*/*.json"
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = song_df.join(
        df, song_df.artist_name == df.artist).withColumn(
            "songplay_id", monotonically_increasing_id()).withColumn(
                'start_time',
                to_timestamp(
                    date_format(
                        (col("ts") / 1000).cast(dataType=TimestampType()),
                        tsFormat),
                    tsFormat)).select("songplay_id", "start_time",
                                      col("userId").alias("user_id"), "level",
                                      "song_id", "artist_id",
                                      col("sessionId").alias("session_id"),
                                      col("artist_location").alias("location"),
                                      "userAgent",
                                      month(col("start_time")).alias("month"),
                                      year(col("start_time")).alias("year"))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(output_data +
                                                               "songplays")
def date_features(df, timestamp_column):

    import pyspark.sql.functions as F

    df = df.withColumn('date', F.to_date(
        F.col(timestamp_column), 'yyyy-MM-dd')).withColumn(
            'day_of_week',
            F.date_format(F.col(timestamp_column), 'u')).withColumn(
                'month',
                F.month('date')).withColumn('day_of_month',
                                            F.dayofmonth('date')).withColumn(
                                                'week_of_year',
                                                F.weekofyear('date'))

    df = df.withColumn(
        'weekend',
        F.when(F.col('day_of_week') == 1,
               'weekend').when(F.col('day_of_week') == 6, 'weekend').when(
                   F.col('day_of_week') == 7, 'weekend').otherwise('Weekday'))

    df = df.withColumn(
        'day',
        F.when(F.col('day_of_week') == 7, 'Saturday').when(
            F.col('day_of_week') == 2,
            'Monday').when(F.col('day_of_week') == 3, 'Tuesday').when(
                F.col('day_of_week') == 4,
                'Wednesday').when(F.col('day_of_week') == 5, 'Thursday').when(
                    F.col('day_of_week') == 6, 'Friday').otherwise('Sunday'))
    df = df.withColumn(
        'month_end',
        F.when(F.col('day_of_month') == 25, 'month_end').when(
            F.col('day_of_month') == 26,
            'month_end').when(F.col('day_of_month') == 27, 'month_end').when(
                F.col('day_of_month') == 28,
                'month_end').when(F.col('day_of_month') == 29,
                                  'month_end').when(
                                      F.col('day_of_month') == 30,
                                      'month_end').when(
                                          F.col('day_of_month') == 31,
                                          'month_end').
        when(F.col('day_of_month') == 1, 'month_end').when(
            F.col('day_of_month') == 2,
            'month_end').when(F.col('day_of_month') == 3, 'month_end').when(
                F.col('day_of_month') == 4,
                'month_end').when(F.col('day_of_month') == 5,
                                  'month_end').otherwise('not_month_end'))

    df = df.withColumn(
        'christmas',
        F.when(
            ((F.col('month') == 12) & (F.col('day_of_month') == 20)),
            'christmas').when(
                ((F.col('month') == 12) & (F.col('day_of_month') == 21)),
                'christmas').when(
                    ((F.col('month') == 12) & (F.col('day_of_month') == 22)),
                    'christmas').when(
                        ((F.col('month') == 12) &
                         (F.col('day_of_month') == 23)), 'christmas').when(
                             ((F.col('month') == 12) &
                              (F.col('day_of_month') == 24)),
                             'christmas').when(
                                 ((F.col('month') == 12) &
                                  (F.col('day_of_month') == 25)),
                                 'christmas').otherwise('not_christmas'))

    return df
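A small usage sketch for date_features above; it assumes a SparkSession, a hypothetical event_date column, and a Spark version where the 'u' day-of-week pattern used by the function is accepted (Spark 2.x, or Spark 3 with spark.sql.legacy.timeParserPolicy set to LEGACY):
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("date_features_demo").getOrCreate()

# hypothetical sample data; any string column parseable as yyyy-MM-dd works here
events = spark.createDataFrame([("2019-12-24",), ("2019-07-06",)], ["event_date"])

features = date_features(events, "event_date")
features.select("date", "day_of_week", "day", "weekend", "month_end", "christmas").show(truncate=False)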
# MAGIC
# MAGIC **`daily_hosts_df`**
# MAGIC
# MAGIC A DataFrame with two columns:
# MAGIC
# MAGIC | column  | explanation                                        |
# MAGIC | ------- | -------------------------------------------------- |
# MAGIC | `day`   | the day of the month                               |
# MAGIC | `count` | the number of unique requesting hosts for that day |

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import dayofmonth

day_to_host_pair_df = logs_df.select(logs_df.host, dayofmonth(logs_df.time).alias('day'))
day_group_hosts_df = day_to_host_pair_df.distinct()
daily_hosts_df = day_group_hosts_df.groupBy('day').count()
daily_hosts_df.cache()

print 'Unique hosts per day:'
daily_hosts_df.show(30, False)

# COMMAND ----------

# TEST Number of unique daily hosts (4c)
daily_hosts_list = (daily_hosts_df
                    .map(lambda r: (r[0], r[1]))
                    .take(30))

Test.assertEquals(day_to_host_pair_df.count(), total_log_entries, 'incorrect row count for day_to_host_pair_df')
#print not200DF
not200DF.show(10)
# Sorted DataFrame containing all paths and the number of times they were accessed with non-200 return code
logs_sum_df = not200DF.groupBy('path').count().sort('count',ascending=False)

print 'Top Ten failed URLs:'
logs_sum_df.show(10, False)

#Find Number of Unique Hosts
unique_host_count = logs_df.select('host').distinct().count()
print 'Unique hosts: {0}'.format(unique_host_count)

#Number of Unique Daily Hosts
from pyspark.sql.functions import dayofmonth

day_to_host_pair_df = logs_df.select('host',dayofmonth('time').alias('day'))
#day_to_host_pair_df.show(10,False)
day_group_hosts_df = day_to_host_pair_df.dropDuplicates()
#day_group_hosts_df.show(10,False)
daily_hosts_df = day_group_hosts_df.groupBy('day').count()

print 'Unique hosts per day:'
daily_hosts_df.show(30, False)
daily_hosts_df.cache()

#Visualizing the Number of Unique Daily Hosts
days_with_hosts = []
hosts = []
for row in daily_hosts_df.collect():
    days_with_hosts.append(row[0])
    hosts.append(row[1])
Example #14
file3 = file3.withColumn('Age', split_col3.getItem(2))
file3 = file3.withColumn('Occupation', split_col3.getItem(3))
file3 = file3.withColumn('Zip-code', split_col3.getItem(4)).drop('_c0')
#file3.show()

#file4 = file1.join(file2, file1.MovieID == file2.MovieID,"inner")  #alternate of it.
file4 = file1.join(file2, ['MovieID'], "inner")
file5 = file4.na.drop()

split_date = f.split(file5['Timestamp'], ' ')
file6 = file5.withColumn('Date',
                         split_date.getItem(0).cast('date')).withColumn(
                             'Time', split_date.getItem(1))
file6 = file6.withColumn('Year', f.year(f.col('Date'))).withColumn(
    'month', f.month(f.col('Date'))).withColumn(
        'day', f.dayofmonth(f.col('Date'))).drop('Timestamp')
#print(file5.count())
#print(file4.count())

#......module1
f1 = file5.groupBy(
    'MovieID', 'Rating',
    'Title').count()  #how many times which movie get what rating
#for i in f1.take(10):
#	print(i)
#f1.orderBy(f.desc('Rating'),f.desc('count')).show(10,truncate=False)
f1 = f1.orderBy(f.desc('Rating'), f.desc('count')).limit(10).toPandas()
print('converted into pandas')
#f2=f1.orderBy(f.desc('Rating'),f.desc('count')).toPandas()
plt.rcParams["figure.figsize"] = [30, 25]
f1.plot(x="Title", y='count', kind="bar")
Example #15
def process_log_data(spark, input_data, output_data):
    """
    Description: This function is used to read the log data in the
        filepath (bucket/log_data) to get the information needed to populate the
        dimensional tables (user, time and song) as well as the songplays fact table.

    Parameters:
        spark: the cursor object.
        input_path: path to the bucket containing song data.
        output_path: path to destination bucket where  parquet files will be saved.

    Returns:
        None
    """
    print('Begin processing log data')

    # get filepath to log data file
    # log_data = input_data + 'log-data/*/*/*.json'
    log_data = input_data + 'log-data/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    users_table = df.select(
        ["userId", "firstName", 'lastName', 'location',
         'gender']).dropDuplicates()

    print('Save users table')
    # write users table to parquet files
    users_table.write.save(output_data + 'users_table',
                           format='parquet',
                           mode='append')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: x / 1000.0)
    df = df.withColumn("timestamp", get_timestamp("ts"))

    # create datetime column from original timestamp column
    get_datetime = udf(
        lambda x: datetime.utcfromtimestamp(float(x)).strftime('%Y-%m-%d %H:%M:%S'))
    df = df.withColumn("date_sp", get_datetime("timestamp"))

    # extract columns to create time table
    time_table = df.withColumn("hour", hour(df.date_sp)) \
    .withColumn("year", year(df.date_sp)) \
    .withColumn("day", dayofmonth(df.date_sp)) \
    .withColumn("week", weekofyear(df.date_sp)) \
    .withColumn("month", month(df.date_sp)) \
    .withColumn("weekday", dayofweek(df.date_sp)) \
    .withColumnRenamed('ts', 'start_time') \
    .select(['start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday']) \
    .dropDuplicates()

    print('Save time table')
    # write time table to parquet files partitioned by year and month
    time_table.repartition("year", "month").write.mode("append").partitionBy(
        "year", "month").parquet(output_data + 'time_table')

    # read in song data to use for songplays table
    song_df = spark.read.json(input_data + 'song-data/*/*/*/*.json')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, [df.song == song_df.title, df.length == song_df.duration, df.artist == song_df.artist_name]) \
    .select(df.ts, df.userId, df.level, song_df.song_id, song_df.artist_id, df.sessionId, df.location, df.userAgent, df.date_sp) \
    .withColumn("year", year(df.date_sp)) \
    .withColumn("month", month(df.date_sp))

    print('Save songplays table - Fact Table')
    # write songplays table to parquet files partitioned by year and month
    songplays_table.repartition("year",
                                "month").write.mode("append").partitionBy(
                                    "year", "month").parquet(output_data +
                                                             'songplays_table')
    print('Completed.')
Example #16
def process_log_data(spark, input_data, output_data):
    '''
            Description: Processes the log data stored in JSON-files via Spark and stores them in parquet files in an S3 container
            
            Parameters:
                    spark (handle): handle to Spark Session
                    input_data (string): path to Input directory on S3
                    output_data (string): path to Output directory on S3
                    
            Returns:
                    -
    '''

    # get filepath to log data file
    log_data = os.path.join(input_data, 'log_data/*/*/*.json')
    #log_data=  os.path.join(input_data, "*.json") #debug using local files

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df['page'] == 'NextSong')

    # extract columns for users table
    users_table = df.select('userId', 'firstName', 'lastName', 'gender',
                            'level').distinct()

    # write users table to parquet files
    users_table.write.parquet(f'{output_data}/users_table', mode='overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000),
                        TimestampType())
    df = df.withColumn('timestamp', get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime('%Y-%m-%d'))
    df = df.withColumn('datetime', get_datetime(df.ts))

    # extract columns to create time table
    time_table = df.select("ts","timestamp").drop_duplicates() \
                    .withColumn("hour", hour(col('timestamp'))) \
                    .withColumn("day", dayofmonth(col('timestamp'))) \
                    .withColumn("week", weekofyear(col('timestamp'))) \
                    .withColumn("month", month(col('timestamp'))) \
                    .withColumn("year", year(col('timestamp'))) \
                    .withColumn("weekday", date_format(col('timestamp'),'E'))

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(f'{output_data}/time_table',
                             mode='overwrite',
                             partitionBy=["year", "month"])

    # read in song data to use for songplays table
    song_df = spark.read.parquet(f'{output_data}/songs_table')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, df.song == song_df.title, how='inner')\
                        .select(monotonically_increasing_id().alias("songplay_id"),"ts",col("userId").alias("user_id"),"level","song_id","artist_id", col("sessionId").alias("session_id"), "location", col("userAgent").alias("user_agent")) \
                        .join(time_table, df.ts == time_table.ts, how="inner")\
                        .select("songplay_id", "user_id", "level", "song_id", "artist_id", "session_id", "location", "user_agent", "year", "month") \
                        .drop_duplicates()

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(f'{output_data}/songplays_table',
                                  mode='overwrite',
                                  partitionBy=["year", "month"])
Example #17
taxi_year = udf(taxi_year)

## Yellow Taxi Data 2011

## read data and register as sql df
taxi_2011 =  spark.read.format("csv").options(header="true",\
inferschema="true").load(sys.argv[1])
taxi_2011.createOrReplaceTempView("taxi_2011")

## breakdown timestamp
taxi_2011 = taxi_2011.withColumn("tripyear",
                                 year(taxi_2011.Trip_Pickup_DateTime))
taxi_2011 = taxi_2011.withColumn("tripmonth",
                                 month(taxi_2011.Trip_Pickup_DateTime))
taxi_2011 = taxi_2011.withColumn("tripday",
                                 dayofmonth(taxi_2011.Trip_Pickup_DateTime))
taxi_2011.createOrReplaceTempView("taxi_2011")

## group by day
taxi_daygroups2011 = spark.sql("SELECT * FROM taxi_2011").\
groupby("tripyear","tripmonth","tripday").count()
taxi_daygroups2011.createOrReplaceTempView("taxi_daygroups2011")
## sort by day
taxi_daygroups2011 = spark.sql("SELECT * FROM taxi_daygroups2011").\
orderBy("tripyear","tripmonth","tripday")
taxi_daygroups2011.createOrReplaceTempView("taxi_daygroups2011")

## Yellow Taxi Data 2012

## read data and register as sql df
taxi_2012 =  spark.read.format("csv").options(header="true",\
Example #18
def process_log_data(spark, input_data, output_data):
    """Function to read source log files from S3 and output 
       parquet files for users, time and songplays back on S3
    
    Args:
        spark: to read files with spark
        input_data: source location for S3 Bucket
        output_data: destination location for S3 Bucket
    
    Output Files:
        s3://output-datalakes/users/users.parquet
        s3://output-datalakes/time/time.parquet
        s3://output-datalakes/songplays/songplays.parquet
    """
    # get filepath to log data file
    log_data =os.path.join(input_data,"log_data/")

    # read log data file
    df = spark.read.json(log_data)
    
    # filter by actions for song plays and also create songplay_id incremental key
    df = df.filter(df['page'] == "NextSong").withColumn('songplay_id', monotonically_increasing_id())

    # extract columns for users table    
    users_table = df.select('userid', \
                            'firstName', \
                            'lastName', \
                            'gender', \
                            'level').dropDuplicates()
    
    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data,"users/","users.parquet"))

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: int(x/1000))
    df = df.withColumn('start_time',from_unixtime(get_timestamp(df['ts'])))
    
    # extract columns to create time table
    time_table = df.select('start_time') \
                   .withColumn('hour',hour(df['start_time'])) \
                   .withColumn('day',dayofmonth(df['start_time'])) \
                   .withColumn('week',weekofyear(df['start_time'])) \
                   .withColumn('month',month(df['start_time'])) \
                   .withColumn('year',year(df['start_time'])) \
                   .withColumn('weekday',dayofweek(df['start_time'])) \
                   .dropDuplicates()
    
    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year","month").parquet(os.path.join(output_data,"time/","time.parquet"))

    # read in song data to use for songplays table
    song_df = spark.read.json(os.path.join(input_data, "song_data/A/A/A"))

    # extract columns from joined song and log datasets to create songplays table 
    songplays_table = df.join(song_df,(df.song == song_df.title) & (df.artist == song_df.artist_name),'left_outer') \
                        .select(df.songplay_id, \
                                df.start_time, \
                                df.userId, \
                                df.level, \
                                song_df.song_id, \
                                song_df.artist_id, \
                                df.sessionId, \
                                df.location, \
                                df.userAgent) \
                         .dropDuplicates()
    songplays_table.show()

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("start_time").parquet(os.path.join(output_data,"songplays/","songplays.parquet"))
def process_log_data(spark, input_data, output_data):
    """
    Loads log data from S3 and transform them into users, time and songplays table,
    and write them on the sparkify S3
    
    Arguments:
        spark {object}: spark session
        input_data {string}: a filepath to S3 where contains log data
        output_data {string}: a filepath to sparkify S3
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    users_table = df.filter(df.userId != '').selectExpr("userId as user_id",
        "firstName as first_name", "lastName as last_name", "gender", "level") \
        .dropDuplicates()

    # output filepath to users table file
    users_table_path = output_data + "users_table.parquet"

    # write users table to parquet files
    users_table.write.mode("overwrite") \
        .parquet(users_table_path)

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime(
        '%Y-%m-%d %H:%M:%S'))
    df = df.withColumn("start_time", get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(
        lambda x: datetime.fromtimestamp(x / 1000).strftime('%Y-%m-%d'))
    df = df.withColumn("datetime", get_datetime(df.ts))

    # extract columns to create time table
    time_table = df.select(
        "start_time",
        hour("start_time").alias("hour"),
        dayofmonth("datetime").alias("day"),
        weekofyear("datetime").alias("week"),
        month("datetime").alias("month"),
        year("datetime").alias("year"),
        dayofweek("datetime").alias("weekday")).dropDuplicates()

    # output filepath to time table
    time_table_path = output_data + "time_table.parquet"

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").mode("overwrite") \
        .parquet(time_table_path)

    # get filepath to song data file
    song_data = input_data + "song_data/*/*/*/*.json"

    # read in song data to use for songplays table
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, (df.song == song_df.title) &
                              (df.length == song_df.duration) &
                              (df.artist == song_df.artist_name),
                              how='left').dropDuplicates()
    songplays_table = songplays_table.withColumn("id",
                                                 monotonically_increasing_id())
    windowSpec = Window.orderBy("id")
    songplays_table.withColumn("songplay_id", row_number().over(windowSpec))
    songplays_table = songplays_table.selectExpr(
        "songplay_id", "start_time", "userId as user_id", "level", "song_id",
        "artist_id", "sessionId as session_id", "location",
        "userAgent as user_agent", "year(start_time) as year",
        "month(start_time) as month")

    # output filepath to songplays table
    songplays_table_path = output_data + "songplays_table.parquet"

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").mode("overwrite") \
        .parquet(songplays_table_path)
Example #20
def process_log_data(spark, input_data, output_data, songs_data):
    """ Process log_data json files which located in S3
        Create table users, time and song_plays
        songs_data will be needed in creation song_plays table
        Store the table in parque format in S3
    
    Args:
      spark                           : Spark Session
      input_data  (string)            : location json files (input)
      output_data (string)            : location parque files (output)
      songs_data  (Spark Dataframe)   : Song Data tables
      
    Returns:
      None
      
    
    """

    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # set schema log data
    logSchema = StructType([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Int()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Int()),
        Fld("song", Str()),
        Fld("status", Int()),
        Fld("ts", Long()),
        Fld("userAgent", Str()),
        Fld("userId", Str()),
    ])

    # read log data file
    df = spark.read.json(log_data, schema=logSchema)

    # filter by actions for song plays
    df = df.filter("page == 'NextSong'")

    # create temporary View for Log
    df.createOrReplaceTempView("logView")

    # extract columns for users table
    users_table = spark.sql("""
        WITH latestChange AS (
            SELECT userId AS userIdLatest,
                   MAX(ts) AS maxTs
            FROM logView
            GROUP BY userId
        )
        SELECT userId AS user_id,
               ts AS tsTemp,
               firstName AS first_name,
               lastName  AS last_name,
               gender,
               level
        FROM logView AS t1
        JOIN latestChange AS t2 
        ON t1.userId = t2.userIdLatest AND t1.ts = t2.maxTs  
        WHERE userId IS NOT NULL
        """).dropDuplicates(['user_id']).drop("tsTemp")

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users'), 'overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: getDateTime(int(x)), TST())
    df = df.withColumn("start_time", get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: getDateTime(int(x)), Dat())
    df = df.withColumn("date_time", get_datetime(df.ts))

    # extract columns to create time table
    time_table = df.select(col("start_time"),
                           hour(df.start_time).alias("hour"),
                           dayofmonth(df.date_time).alias("day"),
                           weekofyear(df.date_time).alias("week"),
                           month(df.date_time).alias("month"),
                           year(df.date_time).alias("year"),
                           date_format(df.date_time, "E").alias("weekday")  ) \
                      .where("start_time is not null") \
                      .dropDuplicates(['start_time'])

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(
        os.path.join(output_data, 'time'), 'overwrite')

    # create temporary View for Log and Song tables
    df.createOrReplaceTempView("logView")
    songs_data.createOrReplaceTempView("songView")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = spark.sql("""
        SELECT start_time,
               year(date_time) AS year,
               month(date_time) AS month,
               userId AS user_id,
               level,
               song_id,
               artist_id,
               sessionId AS session_id,
               location,
               userAgent AS user_agent 
        FROM logView AS t1
        JOIN songView AS t2
        ON  (t1.artist = t2.artist_name)
        AND (t1.song   = t2.title)
        AND (t1.length = t2.duration)
        """)

    songplays_table.show(10)
    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(
        os.path.join(output_data, 'songplays'), 'overwrite')
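Example #20 above relies on abbreviated schema type names (Fld, Str, Int, Dbl, Long, TST, Dat) and a getDateTime helper that are not shown. A plausible sketch of the assumed imports and helper, stated as assumptions rather than the original author's code:
from datetime import datetime
from pyspark.sql.types import (StructType, StructField as Fld, StringType as Str,
                               IntegerType as Int, DoubleType as Dbl, LongType as Long,
                               TimestampType as TST, DateType as Dat)

def getDateTime(ms):
    # assumed helper: convert epoch milliseconds to a Python datetime
    return datetime.fromtimestamp(ms / 1000.0)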
Example #21
data = data.fillna(0)
data = data.rdd
parse_time = udf(lambda time:dt.strptime(time, '%m/%d/%Y %I:%M:%S %p'))

crime_data = data.map(lambda d: (int(d[0]), d[1], parse(d[2]), d[3], d[4], d[5], d[6], d[7], d[8], d[9],
                                 int(d[10]), int(d[11]), int(d[12]), int(d[13]), d[14], int(d[15]),
                                 int(d[16]), int(d[17]), parse(d[18]), float(d[19]), float(d[20]), d[21]))


crime_df = sqlContext.createDataFrame(crime_data,["ID","Case Number","Date","Block","IUCR","Primary Type","Description",
"Location Description","Arrest","Domestic","Beat","District","Ward","Community Area","FBI Code","X Coordinate",
"Y Coordinate","Year","Updated On","Latitude","Longitude","Location"])


#Reduced level dataset for analysis
crime_detail = crime_df.select(
    year(crime_df.Date).alias("Year"),
    month(crime_df.Date).alias("Month"),
    dayofmonth(crime_df.Date).alias("DoM"),
    date_format(crime_df.Date, 'EEEE').alias("DoW"),
    hour(crime_df.Date).alias("Hour"),
    crime_df.Block,
    crime_df["Primary Type"].alias("CrimeType"),
    crime_df.Description,
    crime_df["Location Description"].alias("LocDesc"),
    crime_df.Arrest,
    crime_df.Domestic,
    crime_df.Beat,
    crime_df.District,
    crime_df.Ward,
    crime_df["Community Area"].alias("CommunityArea"),
    crime_df.Latitude,
    crime_df.Longitude,
    crime_df.Location)

crime_detail.registerTempTable("CrimeDetails")

#Top level analysis for the chicago crimes dataset

#print crime_detail.printSchema()


print "Total Records:           %d" % (crime_detail.count())
print "Distinct Year:           %d" % (crime_detail.select('Year').distinct().count())
print "Distinct Hours:          %d" % (crime_detail.select('Hour').distinct().count())
print "Distinct Type of crimes: %d" % (crime_detail.select('CrimeType').distinct().count())
print "Distinct Desc:           %d" % (crime_detail.select('Description').distinct().count())
print "Distinct Blocks:         %d" % (crime_detail.select('Block').distinct().count())
print "Distinct Loc Desc:       %d" % (crime_detail.select('LocDesc').distinct().count())
Example #22
0
    SumLoad1 peakLoad1, SumLoad2 peakLoad2, SumLoad3 peakLoad3,
    SumLoad4 peakLoad4, SumLoad5 peakLoad5, SumLoadSecure peakLoadSecure
    FROM aggregatemaxdf
"""

featureeddf = spark.sql(sqlStatement)

# Extract some time features from "SessionStartHourTime" column
from pyspark.sql.functions import year, month, dayofmonth, hour

featureeddf = featureeddf.withColumn('year',
                                     year(featureeddf['SessionStartHourTime']))
featureeddf = featureeddf.withColumn(
    'month', month(featureeddf['SessionStartHourTime']))
featureeddf = featureeddf.withColumn(
    'dayofmonth', dayofmonth(featureeddf['SessionStartHourTime']))
featureeddf = featureeddf.withColumn('hourofday',
                                     hour(featureeddf['SessionStartHourTime']))
featureeddf.write.mode('overwrite').partitionBy("dayofmonth").parquet(
    HourlyDFFile)

# add day feature
day = 3600 * 24
day_window = F.from_unixtime(
    F.unix_timestamp('SessionStartHourTime') -
    F.unix_timestamp('SessionStartHourTime') % day)
featureeddf = featureeddf.withColumn('SessionStartDay', day_window)
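# Note (added): the unix_timestamp modulo trick above floors SessionStartHourTime to
# midnight of the same day. On Spark >= 2.3, date_trunc is an equivalent, arguably
# clearer alternative:
# day_window = F.date_trunc('day', F.col('SessionStartHourTime'))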

# aggregate daily
featureeddf.createOrReplaceTempView("featureeddf")
sqlStatement = """
Example #23
0
def process_log_data(spark, input_data, output_data):
    """
    This function uses the spark instance, reads the log data from the S3 bucket and converts it into a Spark dataframe.
    
    input: json file path for the log files from s3 bucket
    
    output: 
        
    1. parquet files for users table 
    
    2. parquet files for time table partitioned by 'year' and 'month' 
    
    3. parquet files for songplays table partitioned by 'year' and 'month' 
    
    """

    #get filepath to log data file
    log_data = os.path.join(input_data, "log_data/*/*/*.json")

    # read log data file
    df_log = spark.read.json(log_data)

    # filter by actions for song plays
    df_log = df_log[df_log['page'] == 'NextSong']

    # extract columns for users table
    columns = df_log['userId', 'firstName', 'lastName', 'gender', 'ts',
                     'level']
    users_table = columns.selectExpr("userId as user_id",
                                     "firstName as first_name",
                                     "lastName as last_name", 'gender',
                                     'level')

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users'), 'overwrite')

    #converting the time variable ts into timestamp
    df_log = df_log.withColumn("timestamp", (col("ts").cast('bigint') /
                                             1000).cast("timestamp"))

    # extract columns to create time table
    time_table = df_log.select(
        'timestamp',
        hour('timestamp').alias('hour'),
        dayofmonth('timestamp').alias('day'),
        weekofyear('timestamp').alias('week'),
        month('timestamp').alias('month'),
        year('timestamp').alias('year'),
        date_format('timestamp', 'EEEE').alias('day_of_week'))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time_df'), 'overwrite')

    # read in song data to use for songplays table
    song_data = os.path.join(input_data, "song_data/A/A/A/*.json")
    df_song = spark.read.json(song_data)

    #extract columns from joined song and log datasets to create songplays table
    songplays_table = df_log.join(df_song, (df_log.length == df_song.duration) &
                                  (df_log.artist == df_song.artist_name) &
                                  (df_log.song == df_song.title), 'left')\
                                  .select(col('userId').alias('user_id'),
                                          df_log.location,
                                          col('userAgent').alias('user_agent'),
                                          col('sessionId').alias('session_id'),
                                          df_song.artist_id,
                                          df_song.song_id,
                                          df_log.level,
                                          df_log.timestamp,
                                          year('timestamp').alias('year'),
                                          month('timestamp').alias('month'))
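
    # (added sketch) the songplays table above has no surrogate key; one option,
    # assuming monotonically_increasing_id is imported from pyspark.sql.functions,
    # is to derive a unique (but not consecutive) id before writing:
    songplays_table = songplays_table.withColumn('songplay_id',
                                                 monotonically_increasing_id())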

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'songplays'), 'overwrite')
Example #24
0
def process_log_data(spark, input_data, output_data):
	"""
	Read in log data from S3 to populate the users and time dimension tables and the songplays fact table.
	Write to S3 parquet files representing the tables.
	:param spark: SparkSession object
	:param input_data: S3 path containing input log data to process
	:param output_data: S3 path to write parquet files
	:return: None
	"""
	# logs staging dataframe
	log_data = input_data + "log_data/*/*/*.json"
	log_schema = StructType([
		StructField("artist", StringType()),
		StructField("auth", StringType()),
		StructField("firstName", StringType()),
		StructField("gender", StringType()),
		StructField("itemInSession", IntegerType()),
		StructField("lastName", StringType()),
		StructField("length", DoubleType()),
		StructField("level", StringType()),
		StructField("location", StringType()),
		StructField("method", StringType()),
		StructField("page", StringType()),
		StructField("registration", DoubleType()),
		StructField("sessionId", IntegerType()),
		StructField("song", StringType()),
		StructField("status", IntegerType()),
		StructField("ts", LongType()),
		StructField("userAgent", StringType()),
		StructField("userId", StringType()),
	])
	df = spark.read.json(log_data, schema=log_schema)
	df = df.where("page = 'NextSong'")

	# users dimension table
	# user_id, first_name, last_name, gender, level
	users_table = df.withColumn('max_ts', max('ts').over(Window.partitionBy('userId'))).where(col('ts') == col('max_ts')).drop('max_ts')
	users_table = users_table.selectExpr("userId as user_id", "firstName as first_name", "lastName as last_name", "gender", "level")
	#users_table.write.parquet(output_data + "users")

	# time dimension table
	# start_time, hour, day, week, month, year, weekday
	get_timestamp = udf(lambda x: round(x / 1000), LongType())
	df = df.withColumn("timestamp", get_timestamp(df.ts))
	get_datetime = udf(lambda x: datetime.datetime.fromtimestamp(x / 1000.0), TimestampType())
	df = df.withColumn("datetime", get_datetime(df.ts))
	time_table = df.select("timestamp", "datetime").distinct()
	time_table = time_table.withColumn("hour", hour("datetime"))
	time_table = time_table.withColumn("day", dayofmonth("datetime"))
	time_table = time_table.withColumn("week", weekofyear("datetime"))
	time_table = time_table.withColumn("month", month("datetime"))
	time_table = time_table.withColumn("year", year("datetime"))
	time_table = time_table.withColumn("weekday", date_format("datetime", "u").cast(IntegerType()))
	time_table = time_table.drop("datetime")
	#time_table.write.partitionBy("year", "month").parquet(output_data + "time")

	# songplays fact table
	# songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent
	songs_df = spark.read.parquet(output_data + "songs")
	artists_df = spark.read.parquet(output_data + "artists")
	songplays_df = df.join(songs_df, df.song == songs_df.title, how="left")
	songplays_df = songplays_df.drop("artist_id", "year", "location")
	songplays_df = songplays_df.join(artists_df, songplays_df.artist == artists_df.name, how="left")
	songplays_df = songplays_df.withColumn("songplay_id", monotonically_increasing_id())
	songplays_df = songplays_df.withColumn("year", year("datetime"))
	songplays_df = songplays_df.withColumn("month", month("datetime"))
	songplays_table = songplays_df.selectExpr("songplay_id","timestamp AS start_time", "userId AS user_id", "level", "song_id", \
												"artist_id", "sessionId AS session_id", "location", "userAgent AS user_agent", \
												 "year", "month")
	songplays_table.write.partitionBy("year", "month").parquet(output_data + "songplays")
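The users table above keeps only each user's most recent level by comparing ts against a windowed maximum; that pattern depends on max resolving to the Spark SQL function rather than Python's builtin. A self-contained sketch of the same idea with explicit imports, assuming the same df with userId and ts columns:

from pyspark.sql import Window
from pyspark.sql import functions as F

w = Window.partitionBy('userId')
users_latest = (df.withColumn('max_ts', F.max('ts').over(w))
                  .where(F.col('ts') == F.col('max_ts'))
                  .drop('max_ts'))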
Example #25
0
File: etl.py Project: rfks/DEND
def process_log_data(spark, input_data, output_data):
    """
        This function processes the log_data from S3 and saves the users and time tables as parquet files.
        It then joins the event logs with the song_data and writes the songplays table, also in parquet format.
    """

    # get filepath to log data file
    log_data = input_data + 'log_data'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter('page=="NextSong"')

    # extract columns for users table
    users_table = df.select('userid', 'firstname', 'lastname', 'gender',
                            'level').distinct()

    # write users table to parquet files
    users_table.write.mode('overwrite').parquet(output_data + 'users.parquet')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000.0),
                        TimestampType())
    df = df.withColumn('timestamp', get_timestamp('ts'))

    # create datetime column from original timestamp column -- Don't need this
    #get_datetime = udf()
    #df =

    # extract columns to create time table
    time_table = df.select('timestamp') \
        .withColumnRenamed('timestamp','start_time') \
        .distinct() \
        .withColumn('hour',hour(col('start_time'))) \
        .withColumn('day',dayofmonth(col('start_time'))) \
        .withColumn('week',weekofyear(col('start_time'))) \
        .withColumn('month',month(col('start_time'))) \
        .withColumn('year',year(col('start_time'))) \
        .withColumn('weekday',dayofweek(col('start_time')))

    # write time table to parquet files partitioned by year and month
    time_table.write.mode('overwrite').partitionBy(
        'year', 'month').parquet(output_data + 'time.parquet')

    # read in song data to use for songplays table
    song_df = spark.read.json(input_data + 'song_data/*/*/*')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df,(df.song == song_df.title) & (df.length == song_df.duration) & (df.artist == song_df.artist_name),'inner') \
        .withColumn('songplays_id',monotonically_increasing_id()) \
        .withColumn('start_time',get_timestamp('ts')) \
        .select('songplays_id','start_time','userid','level','song_id','artist_id','sessionid','location','useragent') \
        .withColumnRenamed('userid','user_id') \
        .withColumnRenamed('sessionid','session_id') \
        .withColumnRenamed('useragent','user_agent') \
        .withColumn('month',month(col('start_time'))) \
        .withColumn('year',year(col('start_time')))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode('overwrite').partitionBy(
        'year', 'month').parquet(output_data + 'songplays.parquet')
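A side note on the weekday column above: dayofweek() returns an integer where 1 means Sunday. If day names are preferred instead, a hedged alternative (same time_table, assuming date_format and col are imported from pyspark.sql.functions) is:

time_table = time_table.withColumn('weekday_name',
                                   date_format(col('start_time'), 'E'))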
Example #26
0
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = os.path.join(input_data, "log_data/*.json")
    song_data = os.path.join(input_data, "song_data", "*", "*", "*", "*.json")
    
    users_table_path = os.path.join(output_data, "users-table")
    time_table_path = os.path.join(output_data, "time-table")
    songsplay_table_path = os.path.join(output_data, "songsplay-table")
    mode = "overwrite"  # assumed value: `mode` is used below but was not defined in this snippet
    
    # read log data file
    df = spark.read.json(log_data)
    
    # filter by actions for song plays
    df = df.filter(F.col('page') == 'NextSong')

    # extract columns for users table    
    users_table = df.select(F.col("userId").alias("user_id"), 
                            F.col("firstName").alias("first_name"),
                            F.col("lastName").alias("last_name"),
                            F.col("gender"),
                            F.col("level"))
    
    # write users table to parquet files
    users_table.write.mode(mode).parquet(users_table_path)
    print("users_table saved")

    # create timestamp column from original timestamp column
    get_timestamp = F.udf(lambda x: datetime.fromtimestamp(x / 1000), TimestampType())
    df = df.withColumn("ts_timestamp", get_timestamp(F.col("ts")))
    
    # extract columns to create time table
    time_table = df.select(F.col("ts_timestamp").alias("start_time")).distinct()
    time_table = time_table.select(F.col("start_time"), 
                                   F.hour(F.col("start_time")).alias("hour"), 
                                   F.dayofmonth(F.col("start_time")).alias("day"), 
                                   F.weekofyear(F.col("start_time")).alias("week"), 
                                   F.month(F.col("start_time")).alias("month"), 
                                   F.year(F.col("start_time")).alias("year"),
                                   F.dayofweek(F.col("start_time")).alias("weekday"))
    
    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").mode(mode).parquet(time_table_path)
    print("time_table saved")
    # read in song data to use for songplays table
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, 
                              (df.artist == song_df.artist_name)&\
                              (df.length == song_df.duration)&\
                              (df.song == song_df.title),
                              how="inner")
    songplays_table = songplays_table.select(F.monotonically_increasing_id().alias("songplay_id"), 
                                             F.col("ts_timestamp").alias("start_time"), 
                                             F.col("userId").alias("user_id"), 
                                             F.col("level"), 
                                             F.col("song_id"),
                                             F.col("artist_id"), 
                                             F.col("sessionID").alias("session_id"), 
                                             F.col("location"),
                                             F.col("userAgent").alias("user_agent"),
                                             F.month(F.col("ts_timestamp")).alias("month"),
                                             F.year(F.col("ts_timestamp")).alias("year"))
    # Missing drop duplicates

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").mode(mode).parquet(songsplay_table_path)
    print("songplays_table saved")
Example #27
0
def process_log_data(spark, input_data, output_data):
    ''' Read log data and write users, time and songplays tables.

    Log data is read from any json files found under `input_data`/log_data.

    Data can be read and written from/to local files or S3 buckets ('s3a://').
    Output data is written as parquet files.
    '''
    # get filepath to log data file
    if input_data.startswith('s3a://'):
        # we are reading data from S3
        log_data = list_matching_in_bucket(input_data, 'log_data/')
    else:
        # we are reading local files
        import glob
        glob_pattern = "{}/log_data/*/*/*.json".format(input_data)
        log_data = glob.glob(glob_pattern)
        if 0 == len(log_data):
            print("[ERROR] could not find any log data files:'{}'".format(
                glob_pattern))
            exit(1)

    # read log data file
    df = spark.read.json(log_data)
    print("[INFO] read {} events".format(df.count()))
    print("[INFO] detected schema:")
    df.printSchema()
    df.show(5)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')
    print("[INFO] selected {} 'NextSong' events".format(df.count()))

    # extract columns for users table
    # Note: first sort and then selectively drop duplicates to only store
    # the most recent user `level`.
    users_table = df.sort('ts', ascending=False).select(
        df.userId.alias('user_id'), df.firstName.alias('first_name'),
        df.lastName.alias('last_name'), df.gender, df.level).dropDuplicates(
            ['user_id', 'first_name', 'last_name', 'gender'])
    print("[INFO] saving information for {} users".format(users_table.count()))
    print("[INFO] users_table schema:")
    users_table.printSchema()

    # write users table to parquet files
    users_table.write.parquet("{}/users".format(output_data), mode='overwrite')

    # create datetime column from original timestamp column
    import pyspark.sql.types as pstypes
    get_datetime = udf(lambda ts: datetime.fromtimestamp(ts / 1000.0),
                       pstypes.TimestampType())
    df = df.withColumn('datetime', get_datetime(df.ts))
    # df.printSchema()
    # df.show(2)

    # extract columns to create time table
    time_table = df.select(df.datetime.alias('start_time'),
                           hour(df.datetime).alias('hour'),
                           dayofmonth(df.datetime).alias('day'),
                           weekofyear(df.datetime).alias('week'),
                           month(df.datetime).alias('month'),
                           year(df.datetime).alias('year'),
                           date_format(df.datetime,
                                       'E').alias('weekday')).dropDuplicates()
    print("[INFO] saving information for {} timestamps".format(
        time_table.count()))
    print("[INFO] time_table schema:")
    time_table.printSchema()
    time_table.show(5)

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet("{}/times".format(output_data),
                             partitionBy=['year', 'month'],
                             mode='overwrite')

    # read in song data to use for songplays table
    song_df = spark.read.parquet("{}/songs".format(output_data))
    artist_df = spark.read.parquet("{}/artists".format(output_data))

    # extract columns from joined song and log datasets to create songplays
    # table
    songplays_table = df.join(
        artist_df, artist_df.name == df.artist,
        'inner').join(song_df, [
            song_df.artist_id == artist_df.artist_id,
            song_df.title == df.song,
            song_df.duration == df.length,
        ], 'inner').select(
            monotonically_increasing_id().alias('songplay_id'),
            df.datetime.alias('start_time'),
            df.userId.alias('user_id'),
            df.level.alias('level'),
            song_df.song_id,
            artist_df.artist_id,
            df.sessionId.alias('session_id'),
            df.location.alias('location'),
            df.userAgent.alias('user_agent'),
            # needed for writing the tables partitioned
            month(df.datetime).alias('month'),
            year(df.datetime).alias('year'),
        )
    print("[INFO] saving information for {} songplays".format(
        songplays_table.count()))
    print('[INFO] songplays_table schema:')
    songplays_table.printSchema()
    songplays_table.show(5)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet("{}/songplays".format(output_data),
                                  partitionBy=['year', 'month'],
                                  mode='overwrite')
# |   GOOG|    Sam|200.0|
# |   APPL|  Linda|130.0|
# |   MSFT|    Amy|124.0|
# |   GOOG|Charlie|120.0|
# +-------+-------+-----+

# MISSING DATA

# Display rows with at least 2 non-null values
df.na.drop(thresh=2).show()
# get rows with no null value
df.na.drop(how='any').show()
# don't drop any row
df.na.drop(how='all').show()
# drop row will null data in Sales
df.na.drop(subset=['Sales']).show()
# Fill in any string value
df.na.fill('FILL VALUE').show()
# Fill in any null num value
df.na.fill(0).show()
# Fill all null in Name column
df.na.fill('No Name', subset=['Name']).show()

mean_val = df.select(mean(df['Sales'])).collect()
mean_sales = mean_val[0][0]
df.na.fill(mean_sales, subset=['Sales']).show()
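# (added) the two-step mean imputation above can also be written with a dict argument
# to na.fill, assuming the same df with a numeric 'Sales' column:
# df.na.fill({'Sales': mean_sales}).show()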

# TIMESTAMP

df.select(dayofmonth(df['Date'])).show()
Example #29
0
# ----------------

error_df = logs_df.filter(logs_df['status'] == '200')
error_paths_df = error_df.groupBy('path').count().sort('count', ascending=False)
# error_paths_df.show(10, truncate=False)

# Unique Hosts Count
# -------------------

# unique_hosts_df = logs_df.dropDuplicates(['host']).count()
# print(unique_hosts_df)

# Unique host count by day
# -------------------------

# logs_df.show(10, truncate=False)
day_to_host_df = logs_df.select('host', dayofmonth('time').alias('day')).sort('day', ascending=False)
# day_to_host_df.show(10, truncate=False)
day_to_host_unique_df = day_to_host_df.dropDuplicates()
daily_hosts_df = day_to_host_unique_df.groupBy('day').count()
# daily_hosts_df.show(31, truncate=False)
daily_hosts_df.cache()

# Average requests per day
total_req_per_day_df = logs_df.groupBy(dayofmonth('time').alias('day')).count()
# total_req_per_day_df.show(31)

avg_re_per_day_df = total_req_per_day_df.join(daily_hosts_df, ['day']).\
    select('day', (total_req_per_day_df['count']/daily_hosts_df['count']).cast('integer').alias('Average_Requests'))

avg_re_per_day_df.show(31, truncate=False)
Example #30
0
def process_log_data(spark: SparkSession, input_data: str,
                     output_data: str) -> None:
    """
    Given an input path to log data, select relevant columns for user and time tables and save those to disk
    respecting an output path. Then load previously processed song and artist data, join it with log data, create
    the songplay table and write it to disk.
    :param spark: SparkSession
    :param input_data: Path to input data
    :param output_data: Path to store output data
    :return: None
    """
    # get filepath to log data file
    log_data = f"{input_data}/log_data/*/*/"

    # read log data file
    print("Loading log data")
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table
    user_table = df.dropDuplicates(["userId"]).select(
        ["userId", "firstName", "lastName", "gender", "level"])

    # write users table to parquet files
    print("Writing user table")
    user_table.write.parquet(f"{output_data}/users/", mode="overwrite")

    # create timestamp column from original timestamp column
    df = df.withColumn("ts", from_unixtime(df.ts / 1000))

    # extract columns to create time table
    time_table = df.dropDuplicates(["ts"]).select([
        "ts",
        hour(df.ts).alias("hour"),
        dayofmonth(df.ts).alias("day"),
        weekofyear(df.ts).alias("week"),
        month(df.ts).alias("month"),
        year(df.ts).alias("year"),
        dayofweek(df.ts).alias("weekday")
    ])

    # write time table to parquet files partitioned by year and month
    print("Writing time table")
    time_table.write.partitionBy(["year",
                                  "month"]).parquet(f"{output_data}/time/",
                                                    mode="overwrite")

    # read in song and artist data required for songplays table
    print("Loading song data")
    songs_table = spark.read.parquet(f"{output_data}/songs/")

    print("Loading artist data")
    artists_table = spark.read.parquet(f"{output_data}/artists/")

    # join datasets
    print("Joining song, artist and log data")
    join_cond = [
        df.song == songs_table.title, df.artist == artists_table.artist_name,
        df.length == songs_table.duration
    ]

    joined_df = songs_table.join(artists_table, "artist_id",
                                 "inner").join(df, join_cond, "inner")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = joined_df \
        .withColumn("songplay_id", monotonically_increasing_id()) \
        .withColumn("year", year(df.ts).alias("year")) \
        .withColumn("month", month(df.ts).alias("month")) \
        .select(["songplay_id", "ts", "userId", "level", "song_id", "artist_id", "sessionId",
                 "artist_location", "userAgent", "year", "month"])

    # write songplays table to parquet files partitioned by year and month
    print("Writing songplay table")
    songplays_table.write.partitionBy(["year", "month"
                                       ]).parquet(f"{output_data}/songplays/",
                                                  mode="overwrite")
    print("Finished processing log data")
def etl_world_temperature(spark, input_dir, output_dir):
    """Clean the temperature data"""

    # load data
    data_input_full_file_path = f'{input_dir}/GlobalLandTemperaturesByCity.csv'
    world_temperature_spark_df = spark.read \
        .format('csv') \
        .options(header='true', inferSchema='true', encoding="ISO-8859-1") \
        .load(data_input_full_file_path)

    # just take temperature data after 2003-01-01 and only keep the US data
    world_temperature_spark_df = world_temperature_spark_df \
        .filter(F.col('dt') >= datetime(2003, 1, 1)) \
        .filter(F.col('Country') == 'United States')

    # parse month and day
    us_temperature_spark_df = world_temperature_spark_df \
        .withColumn('month', F.month(F.col('dt'))) \
        .withColumn('day', F.dayofmonth(F.col('dt'))) \
        .drop(F.col('dt'))

    # groupby columns and get the new avg temperature
    avg_us_temperature_spark_df = us_temperature_spark_df \
        .groupBy(['month', 'day', 'City', 'Country', 'Latitude', 'Longitude']) \
        .agg(F.mean('AverageTemperature')) \
        .withColumnRenamed('avg(AverageTemperature)', 'AverageTemperature') \
        .withColumn('month', F.col('month').cast('integer')) \
        .withColumn('day', F.col('day').cast('integer'))

    # convert DMS-style Latitude/Longitude (hemisphere suffix N/S, E/W) into signed numeric values
    avg_us_temperature_spark_df = avg_us_temperature_spark_df \
        .withColumn('Latitude', F.when(F.col('Latitude').rlike('N'), F.regexp_replace('Latitude', 'N', '').cast('double'))
                    .otherwise(-1 * F.regexp_replace('Latitude', 'S', '').cast('double'))) \
        .withColumn('Longitude', F.when(F.col('Longitude').rlike('W'), -1 * F.regexp_replace('Longitude', 'W', '').cast('double'))
                    .otherwise(F.regexp_replace('Longitude', 'E', '').cast('double')))

    # define a udf function to get state based on lat and lon by using reverse_geocoder library
    # https://github.com/thampiman/reverse-geocoder
    def _helper_get_state_(lat, lon):

        coor = (lat, lon)
        result = rg.search(coor)

        return result[0].get('admin1')

    _helper_get_state_udf = F.udf(lambda x, y: _helper_get_state_(x, y),
                                  StringType())

    avg_us_temperature_spark_df = avg_us_temperature_spark_df\
        .withColumn('state', _helper_get_state_udf(F.col('Latitude'), F.col('Longitude')))

    # load i94addr dictionary - map the i94addr values
    i94addr_dictionary_input_full_file_path = f'{input_dir}/dictionary_data/i94addr_dictionary.csv'
    i94addr_dictionary_spark_df = spark \
        .read \
        .format('csv') \
        .options(header='true', inferSchema='true', encoding="ISO-8859-1") \
        .load(i94addr_dictionary_input_full_file_path)

    i94addr_dictionary_spark_df = i94addr_dictionary_spark_df \
        .withColumn('init_cap_value', F.initcap(F.col('value')))

    avg_us_temperature_spark_df = avg_us_temperature_spark_df \
        .join(i94addr_dictionary_spark_df, avg_us_temperature_spark_df.state == i94addr_dictionary_spark_df.init_cap_value, 'left') \
        .drop('init_cap_value') \
        .drop('value') \
        .withColumnRenamed('key', 'state_code')

    avg_us_temperature_spark_df = avg_us_temperature_spark_df \
        .withColumnRenamed("Country", "country") \
        .withColumnRenamed("City", "city") \
        .withColumnRenamed("Latitude", "latitude") \
        .withColumnRenamed("Longitude", "longitude") \
        .withColumnRenamed("AverageTemperature", "avg_temperature")

    avg_us_temperature_spark_df = avg_us_temperature_spark_df \
        .withColumn('city_state_code', F.concat_ws(', ', F.upper(F.col('city')), F.upper(F.col('state_code'))))

    avg_us_temperature_spark_df = avg_us_temperature_spark_df.select(
        'month', 'day', 'city', 'state', 'state_code', 'city_state_code',
        'avg_temperature').distinct()

    # output clean data
    data_output_full_file_path = f'{output_dir}/USCitiesTemperaturesByMonth.parquet'
    avg_us_temperature_spark_df \
        .write \
        .options(encoding="ISO-8859-1") \
        .mode('overwrite') \
        .parquet(data_output_full_file_path)
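The state lookup above calls reverse_geocoder once per row through a UDF, which can be slow. A hedged alternative is to geocode the distinct coordinates once on the driver and join the result back; a sketch, assuming the same spark session and a dataframe named avg_us_temperature_spark_df with numeric Latitude/Longitude columns (as the function has just before the UDF step):

import reverse_geocoder as rg

coord_rows = (avg_us_temperature_spark_df
              .select('Latitude', 'Longitude').distinct().collect())
distinct_coords = [(row['Latitude'], row['Longitude']) for row in coord_rows]
geo_results = rg.search(distinct_coords)  # one batched lookup instead of one call per row
state_rows = [(lat, lon, res.get('admin1'))
              for (lat, lon), res in zip(distinct_coords, geo_results)]
states_df = spark.createDataFrame(state_rows, ['Latitude', 'Longitude', 'state'])
with_state_df = avg_us_temperature_spark_df.join(states_df, ['Latitude', 'Longitude'], 'left')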
def prepare_dwh_data():
    df_dwh_fact_hotel = \
        spark.read.parquet(config.get(config_set, 'dwh.fact.hotel.path')) \
            .filter('start_date >= "2016-01-01"') \
            .select('fact_hotel_id',
                    'start_date',
                    'end_date',
                    'effective_date',
                    'dim_booking_id',
                    'dim_supplier_id',
                    'dim_traveler_profile_id',
                    F.col('issuing_country_id').alias('dim_location_id'))
    df_dwh_dim_booking = \
        spark.read.parquet(config.get(config_set, 'dwh.dim.booking.path')) \
            .filter('booking_locator is not null') \
            .select('dim_booking_id',
                    'booking_locator')
    df_dwh_dim_supplier = \
        spark.read.parquet(config.get(config_set, 'dwh.dim.supplier.path')) \
            .filter('discontinue_date="2000-01-01" or discontinue_date="2999-12-31"') \
            .select('harp_key',
                    'dim_supplier_id')
    df_dwh_dim_traveler_profile = \
        spark.read.parquet(config.get(config_set, 'dwh.dim.traveler.profile.path')) \
            .select('dim_traveler_profile_id',
                    'traveler_guid',
                    F.col('country_code').alias('trav_country'))
    df_dwh_dim_location = spark.read.parquet(config.get(config_set, 'dwh.dim.location.path')) \
        .select('dim_location_id',
                F.col('country_code').alias('loc_country'))

    regexp_pattern = '[^a-zA-Z0-9]+'

    df_dwh = \
        df_dwh_fact_hotel \
            .join(df_dwh_dim_booking, on='dim_booking_id') \
            .join(df_dwh_dim_supplier, on='dim_supplier_id') \
            .join(df_dwh_dim_traveler_profile, on='dim_traveler_profile_id') \
            .join(df_dwh_dim_location, on='dim_location_id', how='left') \
            .select('fact_hotel_id',
                    'start_date',
                    'end_date',
                    'effective_date',
                    'booking_locator',
                    'harp_key',
                    'traveler_guid',
                    'trav_country',
                    'loc_country',
                    F.when(F.col('trav_country') == F.col('loc_country'), 'N').otherwise('Y').alias('emulation_flag')
                    ) \
            .withColumn('concat_dwh_pnr', F.concat('start_date', 'end_date', 'booking_locator')) \
            .withColumn('full_mk',
                        F.regexp_replace(F.concat('concat_dwh_pnr',
                                                  'harp_key',
                                                  'traveler_guid'),
                                         regexp_pattern, '')) \
            .withColumn('prop_mk', F.regexp_replace(F.concat('concat_dwh_pnr', 'harp_key'), regexp_pattern, '')) \
            .withColumn('pnr_mk', F.regexp_replace(F.concat('concat_dwh_pnr'), regexp_pattern, '')) \
            .drop('concat_dwh_pnr')

    df_dwh_deduped = df_dwh \
        .withColumn('rk',
                    F.rank().over(Window
                                  .partitionBy(F.col('full_mk'))
                                  .orderBy(F.col('effective_date').desc(), F.col('fact_hotel_id').desc()))) \
        .filter(F.col('rk') == 1) \
        .drop('rk') \
        .distinct() \
        .withColumn('effective_date_year', F.year('effective_date')) \
        .withColumn('effective_date_month', F.month('effective_date')) \
        .withColumn('effective_date_day', F.dayofmonth('effective_date'))

    # sbx_dst.sw_hotel_hub_dwh_mapping_new_1
    dump_partitioned_dataframe(df_dwh_deduped,
                               ['effective_date_year', 'effective_date_month', 'effective_date_day'],
                               config.get(config_set, 'hotel.hub.dwh.mapping.new.1.path')
                               )
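One design note on the dedup above: F.rank() keeps ties, so several rows could survive for the same full_mk if they share effective_date and fact_hotel_id. When exactly one row per key is required, row_number() is the stricter choice; a minimal sketch with the same window spec:

from pyspark.sql import Window
from pyspark.sql import functions as F

w = Window.partitionBy('full_mk').orderBy(F.col('effective_date').desc(),
                                          F.col('fact_hotel_id').desc())
df_dwh_strict = (df_dwh.withColumn('rn', F.row_number().over(w))
                       .filter(F.col('rn') == 1)
                       .drop('rn'))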
Example #33
0
def process_log_data(spark, input_data, output_data):
    """
    Process log data from Sparkify data warehouse.
    
    Read in the log data and filter on actions for song plays only.
    Extract user data, filtering out empty user ids and duplicates to create a *users* table,
    and write to parquet files.
    Create new columns for timestamp and datetime. Extract the start time (timestamp) and 
    create a *time* table with additional columns for hour, day, week, month, year, weekday.
    Write *time* table to parquet files partitioned by year and month.
    Load song data and join with log data, and extract fact-based data to create a *songplays*
    table. Create an incremental songplay id column and write the *songplays* table to parquet
    files partitioned by year and month.

    Parameters:
        spark        : SparkSession object
        input_data   : filepath to log data directories on S3
        output_data  : filepath to table directories on S3 for storing the
                       partitioned parquet files
        
    Returns:
        none
        
    """   
    print("---[ process_log_data ]---")

    # get filepath to log data file
#     log_data = input_data + "log-data/*/*/*.json"   # with S3 bucket
    log_data = input_data + "log-data/*.json"    # local workspace

    # read log data file
    df_log_data = spark.read.json(log_data)
    
    # filter by actions for song plays
    df_log_data = df_log_data.where("page = 'NextSong'")

    # extract columns for users table    
    users_table = df_log_data \
                    .filter('userId != ""') \
                    .select(col('userId').alias('user_id'), 
                            col('firstName').alias('first_name'), 
                            col('lastName').alias('last_name'),
                            col('gender'),
                            col('level') ) \
                    .dropna(how = "any", subset = ["user_id"]) \
                    .dropDuplicates()
    
    # write users table to parquet files
    users_table.write.parquet(output_data + "users")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.datetime.fromtimestamp(int(x / 1000)) \
                                          .strftime('%Y-%m-%d %H:%M:%S'))
    df_log_data = df_log_data.withColumn( "timestamp"
                                         , to_timestamp(get_timestamp(df_log_data.ts)))
    
    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.datetime.fromtimestamp(int(x / 1000)) \
                                         .strftime('%Y-%m-%d %H:%M:%S'))
    df_log_data = df_log_data.withColumn( "datetime"
                                         , get_datetime(df_log_data.ts))
    
    # extract columns to create time table
    time_table = df_log_data.select \
                    ( col('timestamp').alias('start_time')
                    , hour('datetime').alias('hour')
                    , dayofmonth('datetime').alias('day')
                    , weekofyear('datetime').alias('week')
                    , month('datetime').alias('month')
                    , year('datetime').alias('year')
                    , date_format('datetime', 'E').alias('weekday')
    )
    
    # write time table to parquet files partitioned by year and month
    time_table.write \
              .partitionBy("year", "month") \
              .parquet(output_data + "time")

    # read in song data to use for songplays table
    song_df = spark.read.json(input_data + "song_data/*/*/*/*.json")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df_log_data \
                        .join( song_df
                             , (df_log_data.song   == song_df.title) & \
                               (df_log_data.artist == song_df.artist_name)
                             , 'left_outer') \
                        .select( col("timestamp").alias("start_time")
                               , col("userId").alias("user_id")
                               , df_log_data.level
                               , song_df.song_id
                               , song_df.artist_id
                               , col("sessionId").alias("session_id")
                               , df_log_data.location
                               , col("useragent").alias("user_agent")
                               , year("datetime").alias("year")
                               , month("datetime").alias("month") )

    # EXTRA step: add songplay_id column to the songplays table
    songplays_table = songplays_table \
                        .select( 'start_time', 'user_id', 'level', 'song_id'
                               , 'artist_id', 'session_id', 'location', 'user_agent'
                               , 'year', 'month'
                               , F.row_number() \
                                  .over( Window.partitionBy("year", "month") \
                                               .orderBy( col("start_time").desc()
                                                       , col("user_id").desc() ) ) \
                                  .alias("songplay_id") )

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month") \
                   .parquet(output_data + "songplays")
day_to_host_pair_tuple_df = logs_df.<FILL IN>

day_group_hosts = day_to_host_pair_tuple_df.<FILL IN>

day_host_count_df = day_group_hosts.<FILL IN>

daily_hosts_df = <FILL IN>

print 'Unique hosts per day:'
daily_hosts_df.show(30, False)

# COMMAND ----------

# ANSWER
from pyspark.sql.functions import dayofmonth
day_to_host_pair_tuple_df = logs_df.select("host", dayofmonth("time").alias('day'))

day_group_hosts = day_to_host_pair_tuple_df.distinct()

day_host_count_df = day_group_hosts.groupBy("day").count().orderBy("day")

daily_hosts_df = day_host_count_df.cache()

print 'Unique hosts per day:'
daily_hosts_df.show(30, False)

# COMMAND ----------

# TEST Number of unique daily hosts (4c)
daily_hosts_list = (daily_hosts_df
                    .map(lambda r: (r[0], r[1]))
# 4b

# Number of unique hosts

unique_host_count = logs_df.select(col('host')).distinct().count()

print 'Unique hosts: {0}'.format(unique_host_count)

# 4c

# Unique daily hosts

from pyspark.sql.functions import dayofmonth

day_to_host_pair_df = logs_df.select(logs_df.host, dayofmonth(logs_df.time).alias('day')).cache()
day_group_hosts_df = day_to_host_pair_df.distinct()
daily_hosts_df = day_group_hosts_df.groupBy('day').count().sort('day', ascending = True).cache()

print 'Unique hosts per day:'
daily_hosts_df.show(30, False)


# 4d

# Prepare arrays for plotting

days_with_hosts = daily_hosts_df.map(lambda r: (r[0])).take(30)
hosts = daily_hosts_df.map(lambda r: (r[1])).take(30)
# for <FILL IN>:
#  <FILL IN>