def process_fact_table(spark): ''' Join and enrich the data Parameters: spark : Spark Session ''' files_list = [ 'equipment_sensors', 'equipment_failure_sensors', 'equipment' ] #dict comprehension df_dict = { f: read_silver(spark, 'datalake/silver/{0}/'.format(f)) for f in files_list } fact = df_dict.get('equipment_sensors').join(df_dict.get('equipment'), ['equipment_id'], how='inner') fact = fact.join(df_dict.get('equipment_failure_sensors'), ['sensor_id'], how='inner') fact = fact.select('*', dayofmonth('date').alias('day'), weekofyear('date').alias('week'), month('date').alias('month'), year('date').alias('year'), dayofweek('date').alias('weekday')) print('2-->Enrich - OK ') write_gold(fact, 'datalake/gold/fact/')
def create_immigration_calendar_dimension(df, output_data): """This function creates an immigration calendar based on arrival date :param df: spark dataframe of immigration events :param output_data: path to write dimension dataframe to :return: spark dataframe representing calendar dimension """ # create a udf to convert arrival date in SAS format to datetime object get_datetime = udf(lambda x: (dt.datetime(1960, 1, 1).date() + dt.timedelta(x)).isoformat() if x else None) # create initial calendar df from arrdate column calendar = df.select(['arrdate']).withColumn("arrdate", get_datetime(df.arrdate)).distinct() # expand df by adding other calendar columns calendar = calendar_df.withColumn('arrival_day', dayofmonth('arrdate')) calendar = calendar_df.withColumn('arrival_week', weekofyear('arrdate')) calendar = calendar_df.withColumn('arrival_month', month('arrdate')) calendar = calendar_df.withColumn('arrival_year', year('arrdate')) calendar = calendar_df.withColumn('arrival_weekday', dayofweek('arrdate')) # create an id field in calendar df calendar = calendar.withColumn('id', monotonically_increasing_id()) # write the calendar dimension to parquet file partition_columns = ['arrival_year', 'arrival_month', 'arrival_week'] calendar.write.parquet(output_data + "immigration_calendar", partitionBy=partition_columns, mode="overwrite") return calendar
def process_time(spark, log_df, output_loc): """Extract time data from input DataFrame and write to output location as collection of parquet files. Args: spark (obj): SparkSession log_df (obj): DataFrame with input data output_loc (str): output location """ get_datetime = udf(lambda x: datetime.fromtimestamp(x / 1000.0).strftime( '%Y-%m-%d %H:%M:%S')) # extract columns to create time table time_table = log_df.select( get_datetime('ts').cast('timestamp').alias('start_time')) time_table = time_table \ .select('start_time', hour('start_time').alias('hour'), dayofmonth('start_time').alias('day'), weekofyear('start_time').alias('week'), month('start_time').alias('month'), year('start_time').alias('year'), dayofweek('start_time').alias('weekday')) \ .dropDuplicates() # write time table to parquet file time_table.write.partitionBy( 'year', 'month').mode('append').parquet(f'{output_loc}time') print(f'Processed {time_table.count()} timestamps')
def _extract_dim_date(self, imgDF: DataFrame, output_path: str): print('Extracting dim_date from immigration data...') dateCols = [col for col in imgDF.columns if col.endswith('_date')] dates: DataFrame = None for dateCol in dateCols: dt = imgDF. \ where( F.col(dateCol).isNotNull() ). \ select( F.col(dateCol).alias('date'), F.year(F.col(dateCol)).cast('smallint').alias('year'), F.quarter(F.col(dateCol)).cast('smallint').alias('quarter'), F.month(F.col(dateCol)).cast('smallint').alias('month'), F.dayofweek(F.col(dateCol)).cast('smallint').alias('day_of_week'), F.dayofmonth(F.col(dateCol)).cast('smallint').alias('day_of_month'), F.dayofyear(F.col(dateCol)).cast('smallint').alias('day_of_year'), F.weekofyear(F.col(dateCol)).cast('smallint').alias('week_of_year') ) if dates: dates = dates.union(dt).distinct() else: dates = dt print('Saving dim_date') self._write_data(dates, output_path, 'date') print('Finished saving dim_date') print('Finished extracting dim_date from immigration data.')
def transform_i94_data(self): #Read the Immigration dataset in SAS format i94_df = self.spark.read.format('com.github.saurfang.sas.spark').load(self.load_path + config['LOAD']['I94_DATA'] ) #drop the columns that are not required for analysis i94_drop_cols = ['count','visapost','occup','matflag','biryear','insnum'] i94_df = i94_df.drop(*i94_drop_cols) #convert the numeric columns to Integer datatype int_cols = ['cicid', 'i94yr', 'i94mon', 'i94cit', 'i94res', 'arrdate', 'i94mode','depdate', 'i94bir', 'i94visa'] i94_df = fn.cast_type(i94_df,dict(zip(int_cols,len(int_cols)*[IntegerType()]))) #convert the SAS date format to String format date_cols = ['arrdate','depdate'] for c in date_cols: i94_df = i94_df.withColumn(c,fn.sas_to_date_udf(i94_df[c])) #Create dates dataframe to be stored as a separate file arrdate = i94_df.select('arrdate').distinct() depdate = i94_df.select('depdate').distinct() date_df = arrdate.union(depdate) date_df = date_df.withColumnRenamed('arrdate','date') date_df = date_df.withColumn('year',F.year(date_df.date)) \ .withColumn('month',F.month(date_df.date)) \ .withColumn('day',F.dayofmonth(date_df.date)) \ .withColumn('dayofweek',F.dayofweek(date_df.date)) \ .withColumn('dayofyear',F.dayofyear(date_df.date)) #Write the dates dataframe to a file in S3 in parquet format date_df.write.mode('overwrite').parquet(self.save_path + date_save_path) #Write the I94 dataframe to a file in S3 in parquet format partioned by year and month i94_df.write.partitionBy('i94yr','i94mon').mode('overwrite').parquet(self.save_path + immigration_save_path)
def create_time_table(log_raw, output_bucket_name: str): """ Create time pyspark dataframe Arguments: log_raw {DerivativeDF} -- Log helping class for pyspak dtaframes output_bucket_name {str} -- Output in S3 location Returns: {DerivativeDF} -- time derivate dataframe """ time = DerivativeDF(log_raw.df .withColumn("hour", hour("start_time")) .withColumn("day", dayofmonth("start_time")) .withColumn("week", weekofyear("start_time")) .withColumn("month", month("start_time")) .withColumn("year", year("start_time")) .withColumn("weekday", dayofweek("start_time")) .select("start_time", "hour", "day", "week", "month", "year", "weekday") .distinct() ) time._write_to_parquet( s3_output_path=f"s3://{output_bucket_name}/time", partitions=["year", "month"] ) return time
def create_dim_time(spark, log_data_df): """Creates time dimension dateframe Args: spark: Spark client log_data_df: Sparkify event log data Returns: Time dataframe """ time_df = ( log_data_df # It's possible for timestamps to appear multiple times if two events happened at the same time .drop_duplicates(["ts"]) # Filter by NextSong since we only care about "song plays" .filter(log_data_df.page == "NextSong") # Divide the ts column by 1000 since the timestamp is in milliseconds .select(F.from_unixtime(log_data_df.ts / 1000).alias("start_time")) ) return time_df.select( time_df.start_time, F.hour(time_df.start_time).alias("hour"), F.dayofmonth(time_df.start_time).alias("day"), F.weekofyear(time_df.start_time).alias("week"), F.month(time_df.start_time).alias("month"), F.year(time_df.start_time).alias("year"), F.dayofweek(time_df.start_time).alias("weekday"), )
def get_time_analysis(data, time_name): time_types = {'month' : month('date'), 'year' : year('date'), 'dayofweek' : dayofweek('date'), 'dayofmonth' : dayofmonth('date')} group_function = time_types[time_name] # Groupby the time type, aggregate usefull columns, and sort by time data_bytime = data.groupby(group_function).agg(f.avg('star_rating'), f.count('review_id'), f.stddev('star_rating')).withColumnRenamed('avg(star_rating)', 'rating').withColumnRenamed('stddev_samp(star_ratinng)', 'std_rating').withColumnRenamed(time_name + '(date)', time_name) data_bytime_sorted = data_bytime.sort(time_name) by_time_pd = data_bytime_sorted.toPandas() #fig, (ax1, ax2) = plt.subplots(1,2, figsize=(15,6)) # First plot the average rating with it's condidence interval plt.figure(figsize=(10,6)) plt.plot(by_time_pd[time_name], by_time_pd['rating']) plt.fill_between(by_time_pd[time_name], by_time_pd['rating'] - 1.96 * by_time_pd['stddev_samp(star_rating)'] / np.sqrt(by_time_pd['count(review_id)']), \ by_time_pd['rating'] + 1.96 * by_time_pd['stddev_samp(star_rating)'] / np.sqrt(by_time_pd['count(review_id)']), alpha=0.5) plt.xlabel(time_name) plt.ylabel('Average Rating') plt.title('Average Rating vs ' + time_name + ' for ' + name) plt.savefig(IMG_PATH + "rating_by_" + time_name + "_evolution_"+ name + ".png") plt.clf() plt.figure(figsize=(10,6)) plt.plot(by_time_pd[time_name], by_time_pd['count(review_id)']) plt.xlabel(time_name) plt.ylabel('Number of reviews') plt.title('Number of reviews vs ' + time_name + ' for ' + name) plt.savefig(IMG_PATH + "number_by_" + time_name + "_evolution_"+ name + ".png") plt.clf()
def test_dayofweek(self): from pyspark.sql.functions import dayofweek dt = datetime.datetime(2017, 11, 6) df = self.spark.createDataFrame([Row(date=dt)]) row = df.select(dayofweek(df.date)).first() self.assertEqual(row[0], 2)
def create_time_dimension(input_df, output_data): """ Gather time data, create dataframe and write data into parquet files. :param input_df: dataframe of input data. :param output_data: path to write data to. :return: dataframe representing time dimension """ from datetime import datetime, timedelta from pyspark.sql import types as T def convert_datetime(x): try: start = datetime(1960, 1, 1) return start + timedelta(days=int(x)) except: return None udf_datetime_from_sas = udf(lambda x: convert_datetime(x), T.DateType()) df = input_df.select(["arrdate"])\ .withColumn("arrival_date", udf_datetime_from_sas("arrdate")) \ .withColumn('day', F.dayofmonth('arrival_date')) \ .withColumn('month', F.month('arrival_date')) \ .withColumn('year', F.year('arrival_date')) \ .withColumn('week', F.weekofyear('arrival_date')) \ .withColumn('weekday', F.dayofweek('arrival_date'))\ .select(["arrdate", "arrival_date", "day", "month", "year", "week", "weekday"])\ .dropDuplicates(["arrdate"]) tools.write_to_parquet(df, output_data, "time") return df
def process_log_data(spark, input_data, output_data): """ Aim: To read the json files from s3 and write the related columns to users, time and songplays Input: spark, input_data(source files url), output_data(target files url) Output: users, time,songplays will be written to given target url. """ # get filepath to log data file log_data = input_data + "log_data/*/*/*" # read log data file log_df = spark.read.json(log_data) # filter by actions for song plays log_df = log_df[log_df['page'] == 'NextSong'] # extract columns for users table ufcols = ["userId", "firstName", "lastName", "gender", "level"] users_table = log_df[ufcols].dropDuplicates() # write users table to parquet files users_table.write.parquet(os.path.join(output_data, "users"), "overwrite") # create timestamp column from original timestamp column get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000.0).strftime( '%Y-%m-%d %H:%M:%S')) time_table = log_df[["ts"]] time_table = time_table.withColumn("start_time", to_timestamp(get_timestamp(time_table.ts)))\ .withColumn("hour", hour(to_timestamp(get_timestamp(time_table.ts))))\ .withColumn("day", dayofmonth(to_timestamp(get_timestamp(time_table.ts))))\ .withColumn("week", weekofyear(to_timestamp(get_timestamp(time_table.ts))))\ .withColumn("month", month(to_timestamp(get_timestamp(time_table.ts))))\ .withColumn("year", year(to_timestamp(get_timestamp(time_table.ts))))\ .withColumn("weekday", dayofweek(to_timestamp(get_timestamp(time_table.ts)))) time_table = time_table[[ "start_time", "hour", "day", "week", "month", "year", "weekday" ]].dropDuplicates() # write time table to parquet files partitioned by year and month time_table.write.partitionBy("year", "month").parquet( os.path.join(output_data, "time"), "overwrite") # extract columns from joined song and log datasets to create songplays table songplays_table = song_df.join(log_df, (song_df.title == log_df.song) & (song_df.duration == log_df.length) & (song_df.artist_name == log_df.artist)) songplays_table = songplays_table.withColumn("songplay_id", monotonically_increasing_id()) songplays_table = songplays_table[[ 'songplay_id', from_unixtime( (songplays_table.ts.cast('bigint') / 1000)).cast('timestamp').alias('start_time'), 'userId', 'level', 'song_id', 'artist_id', 'sessionId', 'artist_location', 'userAgent' ]] songplays_table = songplays_table.withColumn( "year", year("start_time")).withColumn("month", month("start_time")) # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy("year", "month").parquet( os.path.join(output_data, "songplays"), "overwrite")
def process_time_table(df, output_data): ''' This function to select columns needed to build time table, then write data into parquet files. Input: - a dataframe of log data. - a S3 path where parquet files will be stored. Following steps will be performed: - From log dataframe, select columns and rename them. Result will be kept in a 'time_table' dataframe. - Write 'time_table' data into parquet files. ''' # extract columns to create time table time_table = df.select(df.datetime.alias('start_time'), F.hour(df.datetime).alias('hour'), F.dayofmonth(df.datetime).alias('day'), F.weekofyear(df.datetime).alias('week'), F.month(df.datetime).alias('month'), F.year(df.datetime).alias('year'), F.dayofweek(df.datetime).alias('weekday')) # write time table to parquet files partitioned by year and month time_table.write.partitionBy('year', 'month').parquet(output_data + "time.parquet")
def write_time_table( df: DataFrame, output_path, filename: str = "time.csv", ) -> DataFrame: """ Extract a time dimension table from the staging data and save it in the csv format. Args: df: Staging dataframe containing source data. output_path: Path to where the resulting csv files are saved. filename: Optional filename of the resulting csv directory in the output path. Returns: A time dimension dataframe. """ time_table = df.drop_duplicates(["date"]).where(f.col("date").isNotNull()) time_table = (time_table.select(["date"]).withColumn( "year", f.year(time_table["date"])).withColumn( "month", f.month(time_table["date"])).withColumn( "day", f.dayofmonth(time_table["date"])).withColumn( "week", f.weekofyear(time_table["date"])).withColumn( "weekday", f.dayofweek(time_table["date"]))) if output_path: time_table.write.csv(f"{output_path}/{filename}") logger.info("Saved time table") return time_table
def create_date_table(df, output_path="./output_data/date"): """ Creates a date dimension table based off of the arrival dates from the immigration events. Args: df (Spark dataframe): input Spark dataframe with demographics data output_path (str): path to write the output dataframe to Return: DataFrame corresponding to date dimension table """ # udf to convert arrival date to datetime object get_datetime = udf(lambda x: (dt.datetime(1960, 1, 1).date() + dt. timedelta(x)).isoformat() if x else None) # use arrival date to create dataframe df_date = df.select(['arrdate' ]).withColumn("arrdate", get_datetime(df.arrdate)).distinct() df_date = df_date.withColumn('date_id', monotonically_increasing_id()) df_date = df_date.withColumn('arrival_day', dayofmonth('arrdate')) df_date = df_date.withColumn('arrival_week', weekofyear('arrdate')) df_date = df_date.withColumn('arrival_month', month('arrdate')) df_date = df_date.withColumn('arrival_year', year('arrdate')) df_date = df_date.withColumn('arrival_weekday', dayofweek('arrdate')) # write the calendar dimension to parquet file partition_columns = ['arrival_year', 'arrival_month', 'arrival_week'] df_date.write.parquet(output_path, partitionBy=partition_columns, mode="overwrite") return df_date
def createDimDate(df): ''' Creates date dimensional table from DateTime field in upstream dataframe :param df: :return: Date Dimensional Dataframe ''' df = df.withColumn("rawKey", F.col('rawKey')) df = df.withColumn("year", F.year(F.col('DateTime'))) df = df.withColumn("month", F.month(F.col('DateTime'))) df = df.withColumn("dayofmonth", F.dayofmonth(F.col('DateTime'))) df = df.withColumn("dayofweek", F.dayofweek(F.col('DateTime'))) df = df.withColumn("dayofyear", F.dayofyear(F.col('DateTime'))) df = df.withColumn("hour", F.hour(F.col('DateTime'))) df = df.withColumn("minute", F.minute(F.col('DateTime'))) df = df.withColumn("dateMinute", F.date_format(F.col("DateTime"), "yyyyMMddHHmm")) df = df.withColumn("quarter", F.quarter(F.col('DateTime'))) df = df.withColumn("date", F.to_date(F.col('DateTime'))) df.createOrReplaceTempView('tempDimDateTable') dimDateDF = spark.sql(" SELECT * FROM \ (select rawKey,dateMinute,dateTime, date,year, month,dayofmonth,dayofweek,dayofyear,hour, minute,quarter \ from tempDimDateTable \ group by rawKey,dateMinute,dateTime, date,year, month,dayofmonth,dayofweek,dayofyear,hour, minute,quarter \ order by dateMinute ASC) \ ") # Generating dateKey field dimDateDF = dimDateDF.withColumn('dateKey', F.monotonically_increasing_id() + 1) # Creating dataframe including date field which will help to generate Fact table factHelperDateDF = dimDateDF.select(F.col('rawKey'), F.col('dateKey'), F.col('dateMinute')) # Dropping unnecessary rawKey field dimDateDF = dimDateDF.drop(F.col('rawKey')) return dimDateDF, factHelperDateDF
def generate_full_time_table(spark, output_folder): """ This creates a full time table until the year 2060. This should reduce the processing time a bit, since the dates don't have to processed each time new facts are added Parameters ---------- spark: SparkSession output_folder : str Folder path of processed parquet file. """ logger.info('Creating dates table') days_till_2060 = range(int(100 * 365.25)) all_dates = [(t, ) for t in days_till_2060] t_schema = T.StructType([T.StructField('i_date', T.IntegerType())]) timeframe = spark.createDataFrame(all_dates, t_schema) timeframe = timeframe.withColumn( "dt_date", F.expr("date_add(to_date('1960-01-01'), i_date)")) timeframe = timeframe.select('i_date', 'dt_date', F.year('dt_date').alias('year'), F.month('dt_date').alias('month'), F.dayofmonth('dt_date').alias('day'), F.dayofweek('dt_date').alias('weekday')) timeframe.write\ .parquet(os.path.join(output_folder, 'dates.parquet'), 'overwrite') logger.info('dates.parquet written')
def clean_data(input_df, threshold=15000): """Clean data""" # Set counts that correspond to time_diff >= 8h to NaNs # Take absolute values of entries_count and exits_count tmp_df = ( input_df .withColumn("entries_count", F.when(F.col("time_diff") >= 8, None).otherwise( F.abs(F.col("entries_count")))) .withColumn("exits_count", F.when(F.col("time_diff") >= 8, None).otherwise( F.abs(F.col("exits_count")))) ) # Replace absolute values of entries_count and exits_count > 15,000 with NaNs tmp_df = ( tmp_df .withColumn("entries_count", F.when(F.col("entries_count") > threshold, None).otherwise( F.col("entries_count"))) .withColumn("exits_count", F.when(F.col("exits_count") > threshold, None).otherwise( F.col("exits_count"))) ) # Impute NaNs with average counts of the same turnstile, hour & day of week tmp_df = ( tmp_df .withColumn("hour", F.hour("time_rounded")) .withColumn("wkdy", F.dayofweek("time_rounded")) ) tmp_df = impute_nans(tmp_df, "entries_count") tmp_df = impute_nans(tmp_df, "exits_count") # Compute traffic output_df = tmp_df.withColumn("traffic", F.col("entries_count") + F.col("exits_count")) return output_df
def create_time_table(log_data: DataFrame) -> DataFrame: """ Process the log data and create table "time". Processing: - timestamp cannot be empty - select distinct timestamps - convert unix timestamp into timestamp - extract date parts Parameters ---------- log_data : pyspark.sql.dataframe.DataFrame Log data. Returns ------- time : pyspark.sql.dataframe.DataFrame """ return (log_data.where( F.col('ts').isNotNull()).select('ts').distinct().withColumn( 'start_time', F.from_unixtime(F.col('ts') / 1000).cast('timestamp')).select( 'start_time', F.hour('start_time').alias('hour'), F.dayofmonth('start_time').alias('day'), F.weekofyear('start_time').alias('week'), F.month('start_time').alias('month'), F.year('start_time').alias('year'), F.dayofweek('start_time').alias('weekday')))
def _transform(self, df): time_variable = self.getColumn() new_time_variable = time_variable + '_new' # code from tawab. Convert all times in a same format. df = df.withColumn( new_time_variable, self.udf_date_formatting()( funct.col(time_variable).cast("String"))) df = df.withColumn( new_time_variable, funct.from_unixtime( funct.unix_timestamp(new_time_variable, self.time_format)).cast(TimestampType())) df = df.withColumn(time_variable + '_year', funct.year(new_time_variable)) df = df.withColumn(time_variable + '_month', funct.month(new_time_variable)) df = df.withColumn(time_variable + '_day', funct.dayofmonth(new_time_variable)) df = df.withColumn(time_variable + '_dayofweek', funct.dayofweek(new_time_variable)) df = df.withColumn(time_variable + '_hour', funct.hour(new_time_variable)) df = df.withColumn(time_variable + '_minutes', funct.minute(new_time_variable)) df = df.withColumn(time_variable + '_seconds', funct.second(new_time_variable)) df = df.drop(new_time_variable) df = df.drop(time_variable) return df
def process_log_data(spark, input_data, output_data): """ Process the log data file from S3, creates song and artist tables and load them back to S3 """ # get filepath to log data file log_data = input_data + 'log_data/*/*/*.json' # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.filter(df.page == 'NextSong') # extract columns for users table user_table = df.select(col("userId"), col("firstName"), col("lastName"), col("gender"), col("level")).distinct() # write users table to parquet files user_table.write.parquet("{}users/users_table.parquet".format(output_data)) # create timestamp column from original timestamp column get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000), TimestampType()) df = df.withColumn("timestamp", get_timestamp(col("ts"))) # create datetime column from original timestamp column get_datetime = udf(lambda x: to_date(x), TimestampType()) df = df.withColumn("start_time", get_timestamp(col("ts"))) # extract columns to create time table df = df.withColumn("hour", hour("timestamp")) df = df.withColumn("day", dayofmonth("timestamp")) df = df.withColumn("month", month("timestamp")) df = df.withColumn("year", year("timestamp")) df = df.withColumn("week", weekofyear("timestamp")) df = df.withColumn("weekday", dayofweek("timestamp")) time_table = df.select(col("start_time"), col("hour"), col("day"), col("week"), \ col("month"), col("year"), col("weekday")).distinct() # write time table to parquet files partitioned by year and month time_table.write.partitionBy("year", "month").parquet( "{}time/time_table.parquet".format(output_data)) # read in song data to use for songplays table song_df = spark.sql( "SELECT DISTINCT song_id, artist_id, artist_name FROM song_data_table") # extract columns from joined song and log datasets to create songplays table songplays_table = df.join(song_df, song_df.artist_name == df.artist, "inner") \ .distinct() \ .select(col("start_time"), col("userId"), col("level"), col("sessionId"), \ col("location"), col("userAgent"), col("song_id"), col("artist_id")) \ .withColumn("songplay_id", monotonically_increasing_id()) # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy("year", "month").parquet(output_data + 'songplays/')
def process_log_data(spark): """ Extract and load log data from S3 then transform it users, times, and songplays parquet files in S3. Times and songplays parquet files are partitioned by year and month Arguments: Spark session Returns: None """ # get filepath to log data file log_data = input_data + "log_data/*/*/*.json" # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.filter(df.page == "NextSong") # extract columns for users table users_table = df.select("userId", "firstName", "lastName", "gender", "level") # write users table to parquet files users_table.write.mode('overwrite').parquet(output_data + "users/users.parquet") # create timestamp column from original timestamp column get_timestamp = udf(lambda x:datetime.fromtimestamp(x/1000), TimestampType()) df = df.withColumn("timestamp", get_timestamp(df.ts)) # create datetime column from original timestamp column get_datetime = udf(lambda x:datetime.fromtimestamp(x/1000), DateType()) df = df.withColumn("datetime", get_datetime(df.ts)) # extract columns to create time table time_table = df.select(col("timestamp").alias("start_time"), hour(df.timestamp).alias("hour"), dayofmonth(df.timestamp).alias("day"), weekofyear(df.timestamp).alias("week"), month(df.timestamp).alias("month"), year(df.timestamp).alias("year"), dayofweek(df.timestamp).alias("weekday")).drop_duplicates() # write time table to parquet files partitioned by year and month time_table.write.partitionBy("year", "month").mode('overwrite').parquet(output_data + "times/times.parquet") # read in song data to use for songplays table song_df = spark.read.parquet(output_data + "songs/songs.parquet") # read in artist data to use for songplays table artist_df = spark.read.parquet(output_data + "artists/artists.parquet") # extract columns from joined song, artist, and log datasets to create songplays table joinedSongArtist_df = song_df.join(artist_df, ['artist_id']) songplays_table = df.join(joinedSongArtist_df, (df.song == joinedSongArtist_df.title) & (df.artist == joinedSongArtist_df.artist_name) & (df.length == joinedSongArtist_df.duration), "left").select(col("timestamp").alias("start_time"), year("timestamp").alias("year"), month("timestamp").alias("month"), col("userId").alias("user_id"), df.level, joinedSongArtist_df.song_id, col("artist_id"), col("sessionId").alias("session_id"), df.location, col("userAgent").alias("user_agent")).drop_duplicates() # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy("year", "month").mode('overwrite').parquet(output_data + "songplays/songplays.parquet")
def process_log_data(spark, logs_df, songs_df, output_location): # filter by actions for song plays logs_df = logs_df.filter("page = 'NextSong'") # extract user data users_table = logs_df.select([ logs_df.userId.alias("user_id"), logs_df.firstName.alias("first_name"), logs_df.lastName.alias("last_name"), logs_df.gender, logs_df.level ]) users_table = users_table.distinct() # write users table to parquet files users_table.write.mode("overwrite").parquet(output_location + '/users.parquet') users_table.show(2) # build time fact table time_df = logs_df.select(getDateTime(logs_df.ts).alias("timestamp")) time_table = time_df.select([ time_df.timestamp.alias("start_time"), F.hour(time_df.timestamp).alias("hour"), F.year(time_df.timestamp).alias("year"), F.dayofmonth(time_df.timestamp).alias("day"), F.month(time_df.timestamp).alias("month"), F.dayofweek(time_df.timestamp).alias("weekday") ]) time_table = time_table.distinct() time_table.show(2) # write time table to parquet files partitioned by year and month time_table.write.mode("overwrite").partitionBy( "year", "month").parquet(output_location + '/time.parquet') # extract columns from joined song and log datasets to create songplays table songs_df.createOrReplaceTempView("songs") logs_df.createOrReplaceTempView("logs") spark.udf.register("getDateTime", getDateTime) songplays_table = spark.sql("""select getDateTime(l.ts) as start_time, l.userId as user_id,l.level, s.song_id, s.artist_id, l.location, l.userAgent as user_agent from logs as l left outer join songs as s on s.artist_name = l.artist and s.title = l.song and s.duration = l.length """) songplays_table = songplays_table.withColumn( "year", F.year(songplays_table.start_time)) songplays_table = songplays_table.withColumn( "month", F.year(songplays_table.start_time)) # write songplays table to parquet files partitioned by year and month songplays_table.write.mode("overwrite").partitionBy( "year", "month").parquet(output_location + '/song_plays.parquet')
def process_log_data(spark, input_data, output_data): """ Description: Function loads data from logs in Udacity S3 bucket, processess it and after extraction of artist and songs tables loads them into my S3 bucket. Output df from earlier functin is used to filter the page == 'NextSong' Parameters: spark : Earlier initiated spark session input_data : Udacity location of log data in json format for events output_data : my S3 bucket where the dimensional tables in parquet format will be written return : None """ log_data = input_data + 'log_data/*/*/*.json' df = spark.read.json(log_data) df = df.filter(df.page == 'NextSong') users_fields = [ "userId as user_id", "firstName as first_name", "lastName as last_name", "gender", "level" ] users_table = df.selectExpr(users_fields).dropDuplicates() users_table.write.mode('overwrite').parquet(output_data + 'users/') get_timestamp = udf(lambda x: datetime.utcfromtimestamp(int(x) / 1000), TimestampType()) df = df.withColumn("start_time", get_timestamp('ts')) time_table = df.withColumn("hour",hour("start_time"))\ .withColumn("day",dayofmonth("start_time"))\ .withColumn("week",weekofyear("start_time"))\ .withColumn("month",month("start_time"))\ .withColumn("year",year("start_time"))\ .withColumn("weekday",dayofweek("start_time"))\ .select("ts","start_time","hour", "day", "week", "month", "year", "weekday").drop_duplicates() time_table.write.mode('overwrite').partitionBy( "year", "month").parquet(output_data + 'time/') song_df = spark.read\ .format("parquet")\ .option("basePath", os.path.join(output_data, "songs/"))\ .load(os.path.join(output_data, "songs/*/*/")) # extract columns from joined song and log datasets to create songplays table songplays_table = df.join(song_df, df.song == song_df.title, how='inner')\ .select(monotonically_increasing_id().alias("songplay_id"),col("start_time"),col("userId").alias("user_id"),"level","song_id","artist_id", col("sessionId").alias("session_id"), "location", col("userAgent")\ .alias("user_agent")) songplays_table = songplays_table.join(time_table, songplays_table.start_time == time_table.start_time, how="inner")\ .select("songplay_id", songplays_table.start_time, "user_id", "level", "song_id", "artist_id", "session_id", "location", "user_agent", "year", "month") # write songplays table to parquet files partitioned by year and month songplays_table.drop_duplicates().write.parquet( os.path.join(output_data, "songplays/"), mode="overwrite", partitionBy=["year", "month"])
def add_timedelta_columns(df): df = df.withColumn("DEP_LOCAL_TIMESTAMP_PLUS_2", df.DEP_LOCAL_TIMESTAMP + expr('INTERVAL 2 HOURS')) df = df.withColumn("DEP_LOCAL_HOUR_PLUS_2", hour(df.DEP_LOCAL_TIMESTAMP_PLUS_2)) df = df.withColumn("MONTH_PLUS_2", month(df.DEP_LOCAL_TIMESTAMP_PLUS_2)) df = df.withColumn("DAY_OF_WEEK_PLUS_2", dayofweek(df.DEP_LOCAL_TIMESTAMP_PLUS_2)) df = df.withColumn("FL_DATETIMEHOUR", (unix_timestamp("FL_DATETIME") + col("DEP_LOCAL_HOUR")*60*60).cast('timestamp')) df = df.withColumn("FL_DATETIMEHOUR_PLUS_2", df.FL_DATETIMEHOUR + expr('INTERVAL 2 HOURS') ) return df
def process_log_data(spark, input_data, output_data): """Insert records into time, users, and songplays tables using data from log and song files.""" # get filepath to log data file log_data = os.path.join(input_data, "log_data/*/*/*.json") # read log data file df = spark.read.json(log_data) df = df.filter(df.page == 'NextSong') # extract columns for users table users_table = df.selectExpr([ "userId as user_id", "firstName as first_name", "lastName as last_name", "gender", "level" ]).dropDuplicates() users_table.write.parquet(os.path.join(output_data, 'users/'), 'overwrite') # create start_time column from ts column and convert it to timestamp get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime( '%Y-%m-%d %H:%M:%S')) df = df.withColumn("start_time", get_timestamp(df.ts)) # extract columns to create time table time_table = df.select("start_time").dropDuplicates().withColumn( "hour", hour(col("start_time"))).withColumn( "day", dayofmonth(col("start_time"))).withColumn( "week", weekofyear(col("start_time"))).withColumn( "month", month(col("start_time"))).withColumn( "year", year(col("start_time"))).withColumn( "weekday", dayofweek(col("start_time"))) # write time table to parquet files partitioned by year and month time_table.write.partitionBy("year", "month").parquet( os.path.join(output_data, 'time/'), 'overwrite') # read in song data to use for songplays table song_df = spark.read.json( os.path.join(input_data, "song_data/*/*/*/*.json")) # extract columns from joined song and log datasets to create songplays table songplays_table = df.join( song_df, (df.song == song_df.title) & (df.artist == song_df.artist_name) & (df.length == song_df.duration), 'left_outer').select(df.start_time, df.userId.alias("user_id"), df.level, song_df.song_id, song_df.artist_id, df.sessionId.alias("session_id"), df.location, df.userAgent.alias("user_agent")).withColumn( "songplay_id", monotonically_increasing_id()).withColumn( "year", year(col("start_time"))).withColumn( "month", month(col("start_time"))) # # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy('year', 'month').parquet( os.path.join(output_data, 'songplays'), 'overwrite')
def process_log_data(spark, schema, input, output): """ Processes log_data and creating users, time, and songplays tables. Params: spark(spark.SparkSession): A SparkSession object schema(spark.sql.types.StructType): A StructType object that represents the schema of the table input(str): Input path output(str): Output path """ logging.info("PROCESSING log_data...") # Read log_data df = (spark.read.json(input, schema = schema, mode = "PERMISSIVE")) \ .where(F.col("page") == "NextSong") # # Create users table and write as parquet logging.info("CREATING users...") users_table = df.selectExpr(["userId as user_id", "firstName as first_name", "lastName as last_name", "gender", "level"]) \ .dropDuplicates() users_table.write \ .partitionBy("last_name") \ .parquet(os.path.join(output, "users"), mode = "overwrite") # Create time table and write as parquet logging.info("CREATING time...") df = df.withColumn( "start_time", F.to_timestamp(F.from_unixtime(df["ts"] / 1000), format="YYYY-MM-dd HH:mm:ss")) time_table = df.select(["start_time", F.hour("start_time").alias("hour"), F.dayofmonth("start_time").alias("day"), F.weekofyear("start_time").alias("week"), F.month("start_time").alias("month"), F.year("start_time").alias("year"), F.dayofweek("start_time").alias("weekday")]) \ .dropDuplicates() time_table.write \ .partitionBy(["year", "month"]) \ .parquet(os.path.join(output, "time"), mode = "overwrite") # Read songs table df_songs = spark.read.parquet(os.path.join(output, "songs")) df = df.join(df_songs, df.song == df_songs.title, how="inner") df = df.withColumn("songplay_id", F.monotonically_increasing_id() + 1) # Create songsplays table and write as parquet logging.info("CREATING songplays...") songplays_table = df.select([ "songplay_id", "start_time", F.col("userId").alias("user_id"), "level", df_songs.song_id.alias("song_id"), "artist_id", F.col("sessionId").alias("session_id"), "location", F.col("userAgent").alias("user_agent") ]) songplays_table.write \ .parquet(os.path.join(output, "songplays"), mode = "overwrite") return logging.info("PROCESSED log_data.")
def process_log_data(spark, input_data, output_data): """ Load data from log_data dataset and extract columns for user and time tables. Data is written to parquet files and stored on S3 """ # get filepath to log data file log_data = input_data + "log_data/*/*/*.json" # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.filter(df.page == "NextSong") # extract columns for users table users_table = df.select("userId", "firstName", "lastName", "gender", "level").dropDuplicates() # write users table to parquet files users_table.write.mode("overwrite").parquet(output_data + "users_table/users.parquet") # create timestamp column from original timestamp column get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000), TimestampType()) df = df.withColumn("start_time", get_timestamp(df.ts)) # extract columns to create time table time_table = df.select("start_time") \ .withColumn("hour", hour("start_time")) \ .withColumn("day", dayofmonth("start_time")) \ .withColumn("week", weekofyear("start_time")) \ .withColumn("month", month("start_time")) \ .withColumn("year", year("start_time")) \ .withColumn("weekday", dayofweek("start_time")) \ .dropDuplicates() # write time table to parquet files partitioned by year and month time_table.write.mode("overwrite").partitionBy("year", "month").parquet(output_data+"time_table/time.parquet") # read in song data to use for songplays table song_df = spark.read.json(input_data + "song_data/*/*/*/*.json") # extract columns from joined song and log datasets to create songplays table joined_df = df.join(song_df, song_df.artist_name == df.artist, "inner") songplays_table = joined_df.select( col("start_time"), col("userId").alias("user_id"), col("level"), col("song_id"), col("artist_id"), col("sessionId").alias("session_id"), col("location"), year("start_time").alias("year"), month("start_time").alias("month"), col("userAgent").alias("user_agent"))\ .withColumn("songplay_id", monotonically_increasing_id()) # write songplays table to parquet files partitioned by year and month songplays_table.write.mode("overwrite").partitionBy("year", "month").parquet(output_data+"songplays/songplays.parquet")
def process_log_data(spark, input_data, output_data): # get filepath to log data file log_data = os.path.join(input_data, "log_data", "*", "*", "*.json") # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.filter(df.page == 'NextSong') # extract columns for users table user_columns = ['userId as user_id', 'firstName as first_name', 'lastName as last_name', 'gender', 'level'] users_table = df.selectExpr(user_columns).dropDuplicates() # write users table to parquet files users_table.write.parquet(path = os.path.join(output_data, "users"), mode = "overwrite") # create timestamp column from original timestamp column get_timestamp = udf(lambda x: datetime.fromtimestamp(x/1000), TimestampType()) df = df.withColumn('timestamp', get_timestamp('ts')) # create column from original timestamp column get_datetime = udf(lambda x: to_date(x), DateType()) df = df.withColumn('datetime', get_datetime('ts')) # extract columns to create time table time_table = df.selectExpr(["timestamp as start_time"]) time_table = time_table.withColumn('hour', hour('start_time')) time_table = time_table.withColumn('day', dayofmonth('start_time')) time_table = time_table.withColumn('week', weekofyear('start_time')) time_table = time_table.withColumn('month', month('start_time')) time_table = time_table.withColumn('year', year('start_time')) time_table = time_table.withColumn('weekday', dayofweek('start_time')) # write time table to parquet files partitioned by year and month time_table.write.partitionBy("year", "month").parquet(path = os.path.join(output_data, "time"), mode = "overwrite") # read in song data to use for songplays table song_df = spark.read.parquet(os.path.join(output_data, 'songs')) # extract columns from joined song and log datasets to create songplays table songplays_table = df \ .join(song_df, song_df.title == df.song, 'inner') \ .orderBy('ts') \ .selectExpr(["timestamp as start_time", "userId as user_id", "level", "song_id", "artist_id", "sessionId as session_id", "location", "userAgent as user_agent", "year(timestamp) as year", "month(timestamp) as month"]) \ .withColumn('songplay_id', monotonically_increasing_id()) # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy("year", "month").parquet(path = os.path.join(output_data, "songplays"), mode = "overwrite")
def process_log_data(spark, input_data_log, output_data): """ 1. takes a spark session, input data path, output data path 2. reads log data from input_data and songs table that is created in process_song_data 3. processes the data, creates users, time, and songplays tables 4. loads the tables to output_data """ # read log data file df = spark.read.json(input_data_log) # cast user id to int df = df.withColumn('user_id', df['userId'].cast('int')) # filter by actions for song plays df = df.where(df.page == "NextSong") # create tmp view df.createOrReplaceTempView('tmp') # extract columns for users table users_table = spark.sql(users_table_query) users_table = users_table.dropDuplicates(['user_id']) # write users table to parquet files users_table.write.parquet(output_data + 'users') # create timestamp column from original timestamp column df = df.withColumn('start_time', from_unixtime(df.ts / 1000.0)) df = df.withColumn('hour', hour(df.start_time)) df = df.withColumn('day', dayofmonth(df.start_time)) df = df.withColumn('week', weekofyear(df.start_time)) df = df.withColumn('weekday', dayofweek(df.start_time)) df = df.withColumn('month', month(df.start_time)) df = df.withColumn('year', year(df.start_time)) # create a temp view df.createOrReplaceTempView('tmp') # extract columns to create time table times_table = spark.sql(times_table_query) times_table = times_table.dropDuplicates(['start_time']) # write time table to parquet files partitioned by year and month times_table.write.partitionBy(['year', 'month']).parquet(output_data + 'times') # read in song data to use for songplays table song_df = spark.read.parquet(output_data + 'songs') # extract columns from joined song and log datasets to create songplays table song_df.createOrReplaceTempView('tmp2') songplays_table = spark.sql(songplays_table_query) # write songplays table to parquet files partitioned by year and month songplays_table.withColumn('year', year(df.start_time))\ .withColumn('month', month(df.start_time))\ .write.partitionBy(['year', 'month']).parquet(output_data + 'songplays')
def i94_dates(spark, df): """ Build the i94_dates DataFrame which is another dimension; this is built from the arrival_date field in the immigrations dataset :params spark - A Pyspark object :params df - A Pyspark cleaned DataFrame Returns - A cleaned dimensional DataFrame """ i94_dates = (df.select( F.col('i94_year'), F.col('i94_month'), F.col('arrival_date'), F.year('arrival_date').alias('year'), F.month('arrival_date').alias('month'), F.dayofmonth('arrival_date').alias('day'), F.dayofweek('arrival_date').alias('dayofweek'), F.when((F.dayofweek('arrival_date') == 1) | (F.dayofweek('arrival_date') == 7), 'weekend').otherwise( 'weekday').alias('is_weekend')).dropDuplicates()) return i94_dates