def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = input_data + "log_data"

    # read log data file
    # df = spark.read.json(log_data + "/2018/11/2018-11-*.json")
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    users_table = df.select(
        ['userId', 'firstName', 'lastName', 'gender', 'level'])

    # write users table to parquet files
    users_table.write.mode('overwrite').parquet(output_data + 'users')

    # create timestamp column from original timestamp column
    get_timestamp = F.udf(lambda x: datetime.fromtimestamp(x / 1000.0),
                          T.TimestampType())
    df = df.withColumn("timestamp", get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x / 1000.0).strftime(
        "%Y-%m-%d %H:%M:%S"))
    df = df.withColumn("start_time", get_datetime(df.ts))

    df = df.withColumn("hour", hour(col("start_time"))) \
        .withColumn("day", dayofmonth(col("start_time"))) \
        .withColumn("week", weekofyear(col("start_time"))) \
        .withColumn("month", month(col("start_time"))) \
        .withColumn("year", year(col("start_time"))) \
        .withColumn("weekday", dayofweek(col("start_time")))

    # extract columns to create time table
    time_table = df.select([
        'timestamp', 'start_time', 'hour', 'day', 'week', 'month', 'year',
        'weekday'
    ])

    # write time table to parquet files partitioned by year and month
    time_table.write.mode('overwrite').partitionBy(
        'year', 'month').parquet(output_data + 'time')

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + '/songs/')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = song_df.select(
        ['song_id', 'title', 'artist_id', 'artist_name', 'duration']) \
        .join(df, (song_df.artist_name == df.artist)
              & (song_df.title == df.song)) \
        .select(['ts', 'userId', 'level', 'song_id', 'artist_id', 'sessionId',
                 'location', 'userAgent', 'year', 'month'])

    songplays_table = songplays_table.withColumn(
        "songplay_id",
        row_number().over(Window.orderBy(monotonically_increasing_id())) - 1)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode('overwrite').partitionBy(
        'year', 'month').parquet(output_data + 'songplays')
def process_log_data(spark, input_data, output_data): """ Loads data from S3, processes it to event tables, which are saved back to S3 Input: spark: A SparkSession instance input_data: location of the json files for processing output_data: S3 bucket for outputting dimensional data in parquet format """ # get filepath to log data file # Documentation example from Udacity: log_data/2018/11/2018-11-12-events.json log_data = input_data + "log_data/*/*/*.json" # read log data file df = spark.read.json(log_data, columnNameOfCorruptRecord='corrupt_record').drop_duplicates() # filter by actions for song plays df = df.filter(df.page == "NextSong") # extract columns for users table users_table = df.select(df.userId.alias("user_id"), df.firstName.alias("first_name"), df.lastName.alias("last_name"), df.gender, df.level).drop_duplicates() # write users table to parquet files users_table.write.parquet(output_data + "users/", mode="overwrite") # create timestamp column from original timestamp column get_timestamp = F.udf(lambda x : datetime.utcfromtimestamp(int(x)/1000), T.TimestampType()) df = df.withColumn("start_time", get_timestamp('ts')) # create datetime column from original timestamp column # get_datetime = F.udf() # df = # extract columns to create time table time_table = df.withColumn("start_time", F.col("start_time")) \ .withColumn("hour", F.hour(F.col("start_time"))) \ .withColumn("day", F.dayofmonth(F.col("start_time"))) \ .withColumn("week", F.weekofyear(F.col("start_time"))) \ .withColumn("month", F.month(F.col("start_time"))) \ .withColumn("year", F.year(F.col("start_time"))) \ .withColumn("weekday", F.dayofweek(F.col("start_time"))) \ .select("ts","start_time","hour", "day", "week", "month", "year", "weekday").drop_duplicates() # write time table to parquet files partitioned by year and month # write time table to parquet files partitioned by year and month time_table.write.parquet(output_data + "time/", mode="overwrite", partitionBy=["year","month"]) # read in song data to use for songplays table songs_parquet = output_data + 'songs/*/*/*.parquet' song_df = spark.read.parquet(songs_parquet) # extract columns from joined song and log datasets to create songplays table songplays_table = df.join(song_df, [df.song == song_df.title], how='inner') \ .join(time_table, df.start_time == time_table.start_time, how="inner") \ .select(F.monotonically_increasing_id().alias("songplay_id"), df.start_time, df.userId.alias("user_id"), df.level, song_df.song_id, df.artist.alias("artist_id"), df.sessionId.alias("session_id"), df.location, df.userAgent.alias("user_agent"), time_table.year, time_table.month) \ .repartition("year", "month") \ .drop_duplicates() # write songplays table to parquet files partitioned by year and month songplays_table.write.parquet(output_data + "songplays/", mode="overwrite", partitionBy=["year","month"])
def process_log_data(spark, input_data, output_data): """Reads in JSON log data and then writes users, time and songplays tables to parquet on S3. Args: spark: The current SparkSession. input_data: The S3 bucket to read in the data. output_data: The S3 bucket to write to. """ # get filepath to log data file # For working in the workspace: log_data = os.path.join(input_data, "log-data/*.json") log_data = os.path.join(input_data, "log-data/*/*/*.json") # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.filter(df.page == 'NextSong') # rename the columns in df df = (df.withColumnRenamed('userId', 'user_id').withColumnRenamed( 'firstName', 'first_name').withColumnRenamed( 'lastName', 'last_name').withColumnRenamed( 'itemInSession', 'item_in_session').withColumnRenamed( 'sessionId', 'session_id').withColumnRenamed('userAgent', 'user_agent')) # extract columns for users table users_table = df.select('user_id', 'first_name', 'last_name', 'gender', 'level').distinct() # write users table to parquet files users_table.write.parquet(output_data + 'users', mode='overwrite') # create timestamp column from original timestamp column # default type is string for UDFs, so we need to switch that by specifying the correct type get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000.0), T.TimestampType()) df = df.withColumn('start_time', get_timestamp(df.ts)) # extract columns to create time table time_table = df.select( 'start_time', hour(col('start_time')).alias('hour'), dayofmonth(col('start_time')).alias('day'), weekofyear(col('start_time')).alias('week'), month(col('start_time')).alias('month'), year(col('start_time')).alias('year'), date_format(col('start_time'), 'EEEE').alias('weekday')) # write time table to parquet files partitioned by year and month time_table.write.partitionBy('year', 'month').parquet(output_data + 'time', mode='overwrite') # read in song data to use for songplays table song_df = spark.read.parquet(output_data + 'songs/year=*/artist_id=*/*.parquet') artist_df = spark.read.parquet(output_data + 'artists/*.parquet') # extract columns from joined song and log datasets to create songplays table songplays_table = df.join( song_df, (df.song == song_df.title) & (df.length == song_df.duration)).join( artist_df, df.artist == artist_df.artist_name).join(time_table, ['start_time']) # create the songplay_id column songplays_table = songplays_table.withColumn('songplay_id', monotonically_increasing_id()) # select the columns of interest songplays_table = songplays_table.select('songplay_id', 'start_time', 'user_id', 'level', 'song_id', 'artist_id', 'session_id', 'location', 'user_agent', 'year', 'month') # write songplays table to parquet files partitioned by year and month (I think this is a copy paste error because year and month aren't listed as required cols) songplays_table.write.partitionBy('year', 'month').parquet( output_data + 'songplays', mode='overwrite')
def _transform(self, df, auxiliar_train):
    if not self.train_file:
        auxiliar_train = auxiliar_train.drop('WinningBid')
        auxiliar_train = auxiliar_train.withColumn('test', lit(0))
        df = df.withColumn('test', lit(1))
        df = auxiliar_train.union(df)
        del auxiliar_train

    # We create the time as Index
    split_col = split(df['ApproximateDate'], ' ')
    df = df.withColumn('time', split_col.getItem(1))  # time

    # Hour Index
    func_index = udf(lambda x: auxiliar_func.time_to_num(x, index='hms'),
                     IntegerType())
    df = df.withColumn('hms_index', func_index(df['time']))

    # We order by UserID-Date
    df = df.orderBy(['UserID', 'hms_index'])

    # We check Null Values
    df.select([count_(when(isnan(c), c)).alias(c) for c in df.columns]).show()

    # We create a rank of users by how many times in the past they saw an ad
    w = (Window().partitionBy(df.UserID).orderBy('time').rowsBetween(
        Window.unboundedPreceding, 0))
    df = df.withColumn('user_id_acumulative', count_(df['UserID']).over(w))

    # Number of Ads/User/Second
    df = df.withColumn('key_id',
                       concat(df['UserID'], lit(' '), df['hms_index']))
    w = (Window().partitionBy(df.key_id).orderBy('hms_index').rowsBetween(
        -sys.maxsize, sys.maxsize))
    df = df.withColumn('number_ads_user_second', count_(df.key_id).over(w))

    # Number of Ads/User
    df_group = df.groupby(['key_id']).agg(count_('key_id').alias('count_ads'))
    split_col = split(df_group['key_id'], ' ')
    df_group = df_group.withColumn('UserID', split_col.getItem(0))
    w = (Window().partitionBy(df_group.UserID).orderBy('key_id').rowsBetween(
        Window.unboundedPreceding, 0))
    df_group = df_group.withColumn('number_ads_user',
                                   sum_(df_group.count_ads).over(w))
    df_group = df_group.select(['key_id', 'number_ads_user'])
    df = df.join(df_group, how='left', on='key_id')
    del df_group

    # Number of Users/Second
    w = (Window().partitionBy(df.ApproximateDate).rowsBetween(
        -sys.maxsize, sys.maxsize))
    df = df.withColumn('number_user_second',
                       approx_count_distinct(df.UserID).over(w))
    # Number of Ads/Second
    df = df.withColumn('number_ads_second',
                       count_(df.ApproximateDate).over(w))

    # Browser Dummy Transformation
    types = df.select('Browser').distinct().collect()
    types = [val['Browser'] for val in types]
    new_cols = [
        when(df['Browser'] == ty, 1).otherwise(0).alias('d_browser_' + ty)
        for ty in types
    ]
    df = df.select(df.columns + new_cols)

    # Decompose Date Variables
    df = df.withColumn('date', to_date(df['ApproximateDate']))      # date
    df = df.withColumn('month', month(df['ApproximateDate']))       # month
    df = df.withColumn('day', dayofmonth(df['ApproximateDate']))    # day
    df = df.withColumn('weekday', dayofweek(df['ApproximateDate'])) # weekday 1=Monday
    df = df.withColumn('hour', hour(df['time']))                    # hour
    df = df.withColumn('minute', minute(df['time']))                # minute

    # Peak Hour
    df = df.withColumn('peak6am8am',
                       when(df['hour'].between(6, 8), 1).otherwise(0))
    df = df.withColumn('peak14pm16pm',
                       when(df['hour'].between(14, 16), 1).otherwise(0))

    # Minute Index
    func_index = udf(lambda x: auxiliar_func.time_to_num(x, index='hm'),
                     IntegerType())
    df = df.withColumn('hm_index', func_index(df['time']))

    # Convert to time-series by Minute
    # We reduce to minutes
    df_time_serie_ads = df.select([
        'hms_index', 'hm_index', 'number_user_second', 'number_ads_second'
    ]).drop_duplicates()
    df_time_serie_user = df.select(['UserID', 'hm_index']).drop_duplicates()

    # Group-by the values
    df_time_serie_user = df_time_serie_user.groupBy('hm_index').agg(
        approx_count_distinct('UserID'))
    df_time_serie_ads = df_time_serie_ads.groupBy('hm_index').agg({
        'number_ads_second': 'sum'
    }).drop_duplicates(subset=['hm_index'])

    # Join ads-users per minute
    df_time_serie = df_time_serie_ads.join(df_time_serie_user,
                                           how='left',
                                           on='hm_index')
    del df_time_serie_ads, df_time_serie_user

    # Rename columns
    df_time_serie = df_time_serie.withColumnRenamed(
        'sum(number_ads_second)', 'number_ads_minute').withColumnRenamed(
            'approx_count_distinct(UserID)', 'number_user_minute')

    # Resample Range of Minutes
    resample_range = list(
        range(
            df_time_serie.select(min_(col('hm_index'))).limit(1).collect()[0][0],
            df_time_serie.select(max_(col('hm_index'))).limit(1).collect()[0][0] + 1,
            1))
    resample_range = self._spark.createDataFrame(resample_range, IntegerType())

    # Join the original df
    df_time_serie = resample_range.join(
        df_time_serie,
        how='left',
        on=resample_range.value == df_time_serie.hm_index).drop(
            *['hm_index']).fillna(0)

    # Create Lags By Minutes
    w = Window().partitionBy().orderBy(col('value'))
    if self.ar_min_lag > 0:
        df_time_serie = df_time_serie.select(
            '*',
            lag('number_user_minute').over(w).alias('ar1_number_user_minute'))
        df_time_serie = df_time_serie.select(
            '*',
            lag('number_ads_minute').over(w).alias('ar1_number_ads_minute'))
    if self.ar_min_lag > 1:
        for l in range(2, self.ar_min_lag + 1, 1):
            df_time_serie = df_time_serie.select(
                '*',
                lag('ar' + str(l - 1) + '_number_user_minute').over(w).alias(
                    'ar' + str(l) + '_number_user_minute'))
            df_time_serie = df_time_serie.select(
                '*',
                lag('ar' + str(l - 1) + '_number_ads_minute').over(w).alias(
                    'ar' + str(l) + '_number_ads_minute'))

    # Remove the lagged Null Values
    df_time_serie = df_time_serie.dropna()

    # join and remove lag Null values of the first minute
    df = df.orderBy(['UserID', 'hms_index'])
    df = df.join(df_time_serie.orderBy(['value']),
                 how='left',
                 on=df.hm_index == df_time_serie.value).drop('value')

    # Convert to time-series and resample by Seconds
    df_time_serie = df.select(
        ['hms_index', 'number_user_second', 'number_ads_second']).drop_duplicates()
    resample_range = list(
        range(
            df_time_serie.select(min_(col('hms_index'))).limit(1).collect()[0][0],
            df_time_serie.select(max_(col('hms_index'))).limit(1).collect()[0][0] + 1,
            1))
    resample_range = self._spark.createDataFrame(resample_range, IntegerType())

    # Join the original df
    df_time_serie = resample_range.join(
        df_time_serie,
        how='left',
        on=resample_range.value == df_time_serie.hms_index).drop(
            *['hms_index']).fillna(0)

    # Create lags
    w = Window().partitionBy().orderBy(col('value'))
    if self.ar_lags > 0:
        df_time_serie = df_time_serie.select(
            '*',
            lag('number_user_second').over(w).alias('ar1_number_user_second'))
        df_time_serie = df_time_serie.select(
            '*',
            lag('number_ads_second').over(w).alias('ar1_number_ads_second'))
    if self.ar_lags > 1:
        for l in range(2, self.ar_lags + 1, 1):
            df_time_serie = df_time_serie.select(
                '*',
                lag('ar' + str(l - 1) + '_number_user_second').over(w).alias(
                    'ar' + str(l) + '_number_user_second'))
            df_time_serie = df_time_serie.select(
                '*',
                lag('ar' + str(l - 1) + '_number_ads_second').over(w).alias(
                    'ar' + str(l) + '_number_ads_second'))

    # Create Moving Average
    if self.ma_ss_lag is not None:
        # Get hour from index
        func_index = udf(lambda x: auxiliar_func.num_to_time(x), StringType())
        df_time_serie = df_time_serie.withColumn(
            'time', func_index(df_time_serie['value']))

        # minute MA terms (Average per second last xx seconds)
        for lag_val in self.ma_ss_lag:
            # range to take into account
            w = (Window.orderBy(df_time_serie['value']).rangeBetween(
                -lag_val, 0))
            # MA variables
            df_time_serie = df_time_serie.withColumn(
                'ma_seconds_' + str(lag_val) + '_number_user_second',
                avg('number_user_second').over(w))
            df_time_serie = df_time_serie.withColumn(
                'ma_seconds_' + str(lag_val) + '_number_ads_second',
                avg('number_ads_second').over(w))

            # Increasing ID
            df_time_serie = df_time_serie.withColumn(
                'rn', monotonically_increasing_id())

            # Replace first values by Null
            df_time_serie = df_time_serie.withColumn(
                'ma_seconds_' + str(lag_val) + '_number_user_second',
                when(df_time_serie['rn'] < lag_val, None).otherwise(
                    df_time_serie['ma_seconds_' + str(lag_val) +
                                  '_number_user_second']))
            df_time_serie = df_time_serie.withColumn(
                'ma_seconds_' + str(lag_val) + '_number_ads_second',
                when(df_time_serie['rn'] < lag_val, None).otherwise(
                    df_time_serie['ma_seconds_' + str(lag_val) +
                                  '_number_ads_second']))

            # Get the average by Minute
            df_time_serie = df_time_serie.withColumn(
                'ma_minute_' + str(lag_val) + '_number_user_second',
                df_time_serie['ma_seconds_' + str(lag_val) +
                              '_number_user_second'] * 60)
            df_time_serie = df_time_serie.withColumn(
                'ma_minute_' + str(lag_val) + '_number_ads_second',
                df_time_serie['ma_seconds_' + str(lag_val) +
                              '_number_ads_second'] * 60)
        df_time_serie = df_time_serie.drop(*['rn'])

    # Remove the lagged Null Values
    df_time_serie = df_time_serie.drop(
        *['time', 'number_user_second', 'number_ads_second']).dropna()

    # join and remove lag Null values of the first minute
    df = df.join(
        df_time_serie.orderBy(['value']),
        how='left',
        on=df.hms_index == df_time_serie.value).drop('value').dropna()

    if self.train_file and not self.variable_analysis:
        df = df.select([
            'key_id', 'hms_index', 'number_ads_user', 'number_user_second',
            'number_ads_second', 'number_ads_user_second', 'peak6am8am',
            'peak14pm16pm', 'user_id_acumulative'
        ] + [x for x in df.columns if x.startswith('d_browser')] +
                       [x for x in df.columns if x.startswith('ar')] +
                       [x for x in df.columns if x.startswith('ma_')] +
                       ['WinningBid'])

    if not self.train_file:
        df = df.filter(df['test'] == 1)
        df = df.select([
            'UserID', 'key_id', 'number_ads_user', 'hms_index',
            'number_user_second', 'number_ads_second',
            'number_ads_user_second', 'peak6am8am', 'peak14pm16pm',
            'user_id_acumulative'
        ] + [x for x in df.columns if x.startswith('d_browser')] +
                       [x for x in df.columns if x.startswith('ar')] +
                       [x for x in df.columns if x.startswith('ma_')])

    df = df.orderBy(['hms_index', 'UserID'])
    df.show()
    return df
def process_log_data(spark, input_data, output_data): # get filepath to log data file log_data = input_data + "log_data/*/*/*.json" song_data = input_data + "song_data/*/*/*/*.json" #song_data = input_data + "song_data/A/A/A/*.json" DEBUG and print("Reading log data files from", log_data) # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.filter(df.page == 'NextSong') \ .where(df.ts.isNotNull()) \ .withColumn("userId", df["userId"].cast(IntegerType())) \ .withColumn("sessionId", df["sessionId"].cast(IntegerType())) DEBUG and print("Preparing users table") # extract columns for users table users_table = df.select( "userId", "firstName", "lastName", "gender", "level").where(col("userId").isNotNull()).dropDuplicates(['userId']) DEBUG and print("Creating and persisting users table") # write users table to parquet files users_table.write.parquet(output_data + "users/", mode='overwrite') DEBUG and print("Creating and persisting time table") # create timestamp column from original timestamp column get_timestamp = udf(lambda ts: datetime.fromtimestamp(ts / 1000), TimestampType()) df = df.withColumn("start_time", get_timestamp(df.ts)) # extract columns to create time table time_table = df.withColumn("hour", hour(df.start_time)) \ .withColumn("day", dayofmonth(df.start_time)) \ .withColumn("week", weekofyear(df.start_time)) \ .withColumn("month", month(df.start_time)) \ .withColumn("year", year(df.start_time)) \ .withColumn("weekday", dayofweek(df.start_time)) \ .select("start_time", "hour", "day", "week", "month", "year", "weekday") \ .dropDuplicates(["start_time"]) # write time table to parquet files partitioned by year and month time_table.write.partitionBy(["year", "month"]).parquet(output_data + "times/", mode='overwrite') DEBUG and print("Creating and persisting songplays table") # read in song data to use for songplays table song_df = spark.read.json(song_data).select("song_id", "title", "artist_id", "artist_name") action_df = df.select("start_time", "userId", "level", "sessionId", "location", "userAgent", "artist", "song") # extract columns from joined song and log datasets to create songplays table songplays_table = action_df.join(song_df, (action_df.artist == song_df.artist_name) & (action_df.song == song_df.title)) \ .select(monotonically_increasing_id().alias("songplay_id"), "start_time", "userId", "level", "song_id", "artist_id", "sessionId", "location", "userAgent") \ .withColumn("month", month(df.start_time)) \ .withColumn("year", year(df.start_time)) # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy(["year", "month" ]).parquet(output_data + "songplays/", mode='overwrite')
def start_stream(args):
    validate_params(args)
    _, brokers, topic = args

    spark = create_spark_session()

    json = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", brokers) \
        .option("subscribe", topic) \
        .load()

    json.printSchema()

    # Explicitly set schema
    schema = StructType([
        StructField("symbol", StringType(), False),
        StructField("timestamp", TimestampType(), False),
        StructField("price", DoubleType(), False)
    ])

    json_options = {"timestampFormat": "yyyy-MM-dd'T'HH:mm'Z'"}
    stocks_json = json \
        .select(from_json(F.col("value").cast("string"), schema, json_options)
                .alias("content"))

    stocks_json.printSchema()

    stocks = stocks_json.select("content.*")

    ####################################
    # Stream to Parquet
    ####################################
    query = stocks \
        .withColumn('year', year(F.col('timestamp'))) \
        .withColumn('month', month(F.col('timestamp'))) \
        .withColumn('day', dayofmonth(F.col('timestamp'))) \
        .withColumn('hour', hour(F.col('timestamp'))) \
        .withColumn('minute', minute(F.col('timestamp'))) \
        .writeStream \
        .format('parquet') \
        .partitionBy('year', 'month', 'day', 'hour', 'minute') \
        .option('startingOffsets', 'earliest') \
        .option('checkpointLocation', '/dataset/checkpoint') \
        .option('path', '/dataset/streaming.parquet') \
        .trigger(processingTime='30 seconds') \
        .start()

    query.awaitTermination()

    # avg_pricing = stocks \
    #     .groupBy(F.col("symbol")) \
    #     .agg(F.avg(F.col("price")).alias("avg_price"))

    ####################################
    # Console Output
    ####################################
    # query2 = avg_pricing.writeStream \
    #     .outputMode('complete') \
    #     .format("console") \
    #     .trigger(processingTime="10 seconds") \
    #     .start()
    # query2.awaitTermination()

    ####################################
    # Table in Memory
    ####################################
    # query3 = avg_pricing \
    #     .writeStream \
    #     .queryName("avgPricing") \
    #     .outputMode("complete") \
    #     .format("memory") \
    #     .trigger(processingTime="10 seconds") \
    #     .start()
    #
    # while True:
    #     print('\n' + '_' * 30)
    #     # interactively query in-memory table
    #     spark.sql('SELECT * FROM avgPricing').show()
    #     print(query3.lastProgress)
    #     sleep(10)
    # query3.awaitTermination()

    ####################################
    # Writing to Postgres
    ####################################
    # Simple insert
    # query = stream_to_postgres(stocks)
    # query.awaitTermination()

    # Average Price Aggregation
    # query = stream_aggregation_to_postgres(stocks)
    # query.awaitTermination()

    # Final Average Price Aggregation with Timestamp columns
    # query = stream_aggregation_to_postgres_final(stocks)
    # query.awaitTermination()
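# start_stream() assumes create_spark_session() wires in the Kafka source.
# A minimal sketch of that factory, assuming Spark 2.4.x; the package
# coordinates are illustrative and must match the installed Spark/Scala version.
from pyspark.sql import SparkSession


def create_spark_session():
    return SparkSession.builder \
        .appName("stock-stream") \
        .config("spark.jars.packages",
                "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5") \
        .getOrCreate()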
def process_log_data(spark, input_data, output_data):
    '''
    Get the files from log folders and compose a DataFrame.
    Create the users, time and songplays tables with the desired columns and format.

    Parameters:
        spark (object): Previously created spark object.
        input_data (string): Key for AWS S3 objects to read.
        output_data (string): Key for AWS S3 objects to save.

    Returns:
        None
    '''
    # get filepath to log data file
    log_data = input_data + 'log_data'

    # read log data file
    # smaller data to test: s3a://{}:{}@udacity-dend/log_data/2018/11/2018-11-12*.json
    df = spark.read.json("s3a://{}:{}@udacity-dend/log_data/*/*/*.json"
                         .format(os.environ['AWS_ACCESS_KEY_ID'],
                                 os.environ['AWS_SECRET_ACCESS_KEY']))

    # filter by actions for song plays
    df = df.filter(df['page'] == 'NextSong')

    # extract columns for users table
    users_columns = ['userId', 'firstName', 'lastName', 'gender', 'level']
    users_table = df.select(*users_columns).dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + '/users', mode='overwrite')

    # create datetime column from original timestamp column
    df = df.withColumn('datetime', from_unixtime(col('ts') / 1000))

    # extract columns to create time table
    df_time = df.select('datetime').dropDuplicates()
    time_table = df_time.withColumnRenamed('datetime', 'start_time') \
        .orderBy('start_time', ascending=True) \
        .withColumn('hour', hour(col('start_time'))) \
        .withColumn('day', dayofmonth(col('start_time'))) \
        .withColumn('week', weekofyear(col('start_time'))) \
        .withColumn('month', month(col('start_time'))) \
        .withColumn('year', year(col('start_time'))) \
        .withColumn('weekday', dayofweek(col('start_time')))

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(output_data + '/time', mode='overwrite',
                             partitionBy=['year', 'month'])

    # read in song data to use for songplays table
    basePath = output_data + '/songs/'
    song_df = spark.read.option("basePath", basePath).parquet(output_data + '/songs/*')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, df.song == song_df.title, how='left')
    songplays_table = songplays_table.drop('song', 'artist', 'title', 'year',
                                           'duration')
    columns_name = [
        'start_time', 'user_id', 'level', 'session_id', 'location',
        'user_agent', 'song_id', 'artist_id'
    ]
    songplays_table = songplays_table.toDF(*columns_name)
    songplays_table = songplays_table.withColumn('month', month(col('start_time'))) \
        .withColumn('year', year(col('start_time')))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(output_data + '/songplays', mode='overwrite',
                                  partitionBy=['year', 'month'])
def process_log_data(spark, input_data, output_data): """ Loads the Log files, extracts the data for users table, time table and songplays table then saves it to parquet files Parameters: spark: spark session input_data: input files path output_data: output files path """ # get filepath to log data file log_data = input_data # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.filter(df.page == "NextSong") # extract columns for users table users_table = df.select("userId", "firstName", "lastName", "gender", "level").drop_duplicates() # write users table to parquet files users_table.write.parquet(output_data + "users/", mode="overwrite") # create timestamp column from original timestamp column get_timestamp = udf(lambda ts: datetime.utcfromtimestamp(int(ts) / 1000), TimestampType()) df = df.withColumn("timestamp", get_timestamp(col("ts"))) # create datetime column from original timestamp column get_datetime = udf(lambda ts: to_date(ts), TimestampType()) df = df.withColumn("start_time", get_timestamp(col("ts"))) # extract columns to create time table df = df.withColumn("hour",hour("start_time"))\ .withColumn("day",dayofmonth("start_time"))\ .withColumn("week",weekofyear("start_time"))\ .withColumn("month",month("start_time"))\ .withColumn("year",year("start_time"))\ .withColumn("weekday",dayofweek("start_time")) time_table = df.select("start_time", "hour", "day", "week", "month", "year", "weekday").distinct() # write time table to parquet files partitioned by year and month time_table.write.parquet(output_data + "time_table/", mode='overwrite', partitionBy=["year", "month"]) # read in song data to use for songplays table song_df = spark.sql( "SELECT DISTINCT song_id, artist_id, artist_name FROM df_songs_table") # extract columns from joined song and log datasets to create songplays table songplays_table = df.join(song_df, song_df.artist_name == df.artist, "inner") \ .distinct() \ .select("start_time", "userId", "level", "sessionId", "location", "userAgent","song_id","artist_id", "month", "year") \ .withColumn("songplay_id", monotonically_increasing_id()) # write songplays table to parquet files partitioned by year and month songplays_table.write.parquet(OUTPUT_DATA + "songplays_table/", mode="overwrite", partitionBy=["year", "month"])
    users_table = df.select(user_fields).dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + 'users/')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: x / 1000, Dbl())
    df = df.withColumn('ts2', get_timestamp('ts'))

    # create datetime column from original timestamp column
    df = df.withColumn('start_time',
                       from_unixtime('ts2').cast(dataType=TimestampType()))

    # extract columns to create time table
    time_table = df.select('start_time') \
        .dropDuplicates() \
        .withColumn('hour', hour(col('start_time'))) \
        .withColumn('day', dayofmonth(col('start_time'))) \
        .withColumn('week', weekofyear(col('start_time'))) \
        .withColumn('month', month(col('start_time'))) \
        .withColumn('year', year(col('start_time'))) \
        .withColumn('weekday', date_format(col('start_time'), 'E'))

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(output_data + 'time/')

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + 'songs/*/*/*.parquet')

    songs_logs = df.join(song_df, (df.song == song_df.title))

    songplays = songs_logs.join(time_table,
# TEST Five dates for 404 requests (4g)
Test.assertEquals([(r[0], r[1]) for r in top_err_date_df.take(5)],
                  [(7, 532), (8, 381), (6, 372), (4, 346), (15, 326)],
                  'incorrect top_err_date_df')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (5h) Exercise: Hourly 404 Errors
# MAGIC
# MAGIC Using the DataFrame `not_found_df` you cached in part (5a) and sorting by hour of the day in increasing order, create a DataFrame containing the number of requests that had a 404 return code for each hour of the day (midnight starts at 0). Cache the resulting DataFrame `hour_records_sorted_df` and print that as a list.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import hour
hour_records_sorted_df = not_found_df.select(
    hour('time').alias('hour')).groupBy('hour').count().sort(
        'hour', ascending=True).cache()

print 'Top hours for 404 requests:\n'
hour_records_sorted_df.show(24)

# COMMAND ----------

# TEST Hourly 404 response codes (5h)
errs_by_hour = [(row[0], row[1]) for row in hour_records_sorted_df.collect()]

expected = [
    (0, 175),
    (1, 171),
    (2, 422),
    (3, 272),
def process_log_data(spark, input_data, output_data): """ Description: - Extract log data from JSON files stored in S3 bucket - Transforms log data into three separate DataFrames; users_table, time_table and songplays_table - Loads them back into s3 as parquet files stored in a separate s3-bucket for analytical purposes Arguments: - Parameter spark: the instantiated SparkSession - Parameter input_data: input path - Parameter output_data: output path Returns: - None """ # get filepath to log data file log_data = input_data + "log_data/*.json" # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.where(df["page"] == "NextSong") # extract columns for users table users_table = df.selectExpr(["userId as user_id" ,\ "firstName as first_name" ,\ "lastName as last_name" ,\ "gender" ,\ "level"]).dropDuplicates() # write users table to parquet files users_table.write.mode("overwrite").parquet(output_data + "users.parquet") # create timestamp column from original timestamp column get_timestamp = udf( lambda epoch_time: datetime.fromtimestamp(epoch_time / 1000), TimestampType()) df = df.withColumn('timestamp', get_timestamp(col('ts'))) # create datetime column from original timestamp column get_datetime = udf( lambda epoch_time: datetime.fromtimestamp(epoch_time / 1000), DateType()) df = df.withColumn('datetime', get_datetime(col('ts'))) # extract columns to create time table time_table = df.select([hour("timestamp").alias("hour") ,\ dayofmonth("timestamp").alias("day") ,\ weekofyear("timestamp").alias("week") ,\ month("timestamp").alias("month") ,\ year("timestamp").alias("year") ,\ date_format("timestamp", 'E').alias("weekday")]).dropDuplicates() # write time table to parquet files partitioned by year and month time_table.write.mode("overwrite").partitionBy( "year", "month").parquet(output_data + "time.parquet") # read in song data to use for songplays table song_df = spark.read.parquet(output_data + "songs.parquet") # extract columns from joined song and log datasets to create songplays table songplays_table = song_df.join(df, df.song == song_df.title)\ .selectExpr(["timestamp as start_time" ,\ "userid as user_id" ,\ "level" ,\ "song_id" ,\ "artist_id" ,\ "sessionid as session_id" ,\ "location" ,\ "useragent as user_agent"]) \ .withColumn("year", year("start_time")) \ .withColumn("month", month("start_time")).dropDuplicates() # write songplays table to parquet files partitioned by year and month songplays_table.createOrReplaceTempView("songplays") spark.sql(""" SELECT row_number() over (order by start_time asc) as songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent, year, month FROM songplays """).write.mode("overwrite").partitionBy( "year", "month").parquet(output_data + "songplays.parquet")
plt.ylabel('404 Errors')
plt.axhline(linewidth=3, color='#999999')
plt.axvline(linewidth=2, color='#999999')
display(fig)

display(errors_by_date_sorted_df)

# Top Five Days for 404 Errors
top_err_date_df = errors_by_date_sorted_df.sort('count', ascending=False)
print 'Top Five Dates for 404 Requests:\n'
top_err_date_df.show(5)

# Hourly 404 Errors
from pyspark.sql.functions import hour
hour_records_sorted_df = not_found_df.groupBy(hour('time').alias('hour')).count()
print 'Top hours for 404 requests:\n'
hour_records_sorted_df.show(24)
hour_records_sorted_df.cache()

# Visualizing the 404 Response Codes by Hour
hours_with_not_found = map(lambda hour: hour[0],
                           hour_records_sorted_df.select('hour').collect())
not_found_counts_per_hour = map(lambda counts: counts[0],
                                hour_records_sorted_df.select('count').collect())
print hours_with_not_found
print not_found_counts_per_hour

fig, ax = prepareSubplot(np.arange(0, 25, 5), np.arange(0, 500, 50))
colorMap = 'seismic'
cmap = cm.get_cmap(colorMap)
cmap = cm.get_cmap(colorMap)

plt.plot(days_with_errors_404, errors_404_by_day, color=cmap(0), linewidth=3)
plt.axis([0, max(days_with_errors_404), 0, max(errors_404_by_day)])
plt.xlabel('Day')
plt.ylabel('404 Errors')
plt.axhline(linewidth=3, color='#999999')
plt.axvline(linewidth=2, color='#999999')
display(fig)

# Top Five Days for 404 Errors
top_err_date_df = errors_by_date_sorted_df.sort("count", ascending=False)
print 'Top Five Dates for 404 Requests:\n'
top_err_date_df.show(5)

# Hourly 404 Errors
from pyspark.sql.functions import hour
hour_records_sorted_df = not_found_df.select(
    not_found_df['status'],
    hour(not_found_df['time']).alias('hour')).groupBy('hour').count().sort(
        'hour', ascending=True).cache()
print 'Top hours for 404 requests:\n'
hour_records_sorted_df.show(24)

# Visualizing the 404 Response Codes by Hour
hours_with_not_found = []
not_found_counts_per_hour = []
for x, y in hour_records_sorted_df.select('hour', 'count').collect():
    hours_with_not_found.append(x)
    not_found_counts_per_hour.append(y)
print hours_with_not_found
print not_found_counts_per_hour

fig, ax = prepareSubplot(np.arange(0, 25, 5), np.arange(0, 500, 50))
colorMap = 'seismic'
cmap = cm.get_cmap(colorMap)
# Pickups/Dropoffs in entire NYC
taxi_nyc_df = taxi_df.groupby(taxi_df.Time).agg(*sum_aggregations('Nyc')).cache()
taxi_nyc_1h_df = get_agg_taxi_df(taxi_nyc_df, 1, 'Time', sum_aggregations('Nyc', 1))
taxi_nyc_4h_df = get_agg_taxi_df(taxi_nyc_df, 4, 'Time', sum_aggregations('Nyc', 4))

# Time features
date_df = taxi_df.select(taxi_df.Time).distinct()
weekday_udf = udf(lambda date_time: date_time.weekday(), IntegerType())
is_holiday_udf = udf(lambda date_time: date_time.date() in holidays.UnitedStates(),
                     BooleanType())
date_df = date_df.withColumn('Hour', func.hour(date_df.Time))
date_df = date_df.withColumn('Day_Of_Week', weekday_udf(date_df.Time))
date_df = date_df.withColumn('Day_Of_Year', func.dayofyear(date_df.Time))
date_df = date_df.withColumn('Is_Holiday', is_holiday_udf(date_df.Time))

# Aggregate events happening in last and next 3 hours for each hour
event_3h_df = event_df.withColumnRenamed('Venues', 'Venues_0h')
for i in range(-3, 4):
    if i != 0:
        add_hours_udf = udf(lambda date_time: date_time + datetime.timedelta(hours=i),
                            TimestampType())
        event_3h_df = event_3h_df.join(
            event_df.withColumn('Time', add_hours_udf(event_df.Time))
                    .withColumnRenamed('Venues', 'Venues_%sh' % str(i)),
            'Time')

# Join single feature groups
features_df = taxi_df.select(index_columns + [taxi_df.Pickup_Count]) \
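# The sum_aggregations() helper used above is defined elsewhere in that project.
# Purely as an illustration of the pattern (a list of aggregation expressions
# with optional hour-window suffixes), a hypothetical version might look like
# this; the real column names and signature may differ.
from pyspark.sql import functions as func


def sum_aggregations(prefix, hours=None):
    suffix = '_%sh' % hours if hours else ''
    # Pickup_Count appears in the snippet above; Dropoff_Count is an assumption.
    return [func.sum(c).alias('%s_%s%s' % (prefix, c, suffix))
            for c in ('Pickup_Count', 'Dropoff_Count')]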
def process_log_data(spark, input_data, output_data): """ In this function we are loading the song_data file and create tables for songplays,users and time tables. Input: Sparksession, Input_data filepath for songs data Output_data filepath for songs data Output: We produce parquet files for songplays,users and time tables. """ # get filepath to log data file log_data = input_data # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.where(col("page") == "NextSong") # extract columns for users table users_table = df['userId', 'firstName', 'lastName', 'gender', 'level'] # write users table to parquet files users_table = users_table.write.partitionBy('userId').parquet( os.path.join(output_data, 'users.parquet'), 'overwrite') print("users_table partitioned!") # create timestamp column from original timestamp column get_timestamp = udf(lambda x: tstodatetime(x)) df = df.withColumn('daytime', get_timestamp(col("ts"))) # extract columns to create time table time_table = df.select( col("ts").alias('start_time'), year('daytime').alias('year'), month('daytime').alias('month'), dayofmonth('daytime').alias('day'), hour('daytime').alias('hour'), weekofyear('daytime').alias('weekofyear')) #We are going to partition later in the code! # read in song data to use for songplays table sqlContext = SQLContext(spark) songs_table = sqlContext.read.parquet( 'data/outputs/song_data/songs.parquet') # extract columns from joined song and log datasets to create songplays table songplays_table = df['ts', 'userId', 'level', 'sessionId', 'location', 'userAgent', 'song'] #add artists id and song id by joining with songs_table songplays_table = songplays_table.alias('s').join(songs_table.alias('e'),col('e.title') == col('s.song'))\ .select(col('s.ts').alias('start_time'), col('s.userId'), col('s.level'), col('s.sessionId'), col('s.location'), col('s.userAgent'), col('s.song'), col('e.artist_id').alias('artist_id'), col('e.song_id').alias('song_id')) #add month and year for partitioning later based on those time_table_short = time_table['start_time', 'month', 'year'] songplays_table = songplays_table.alias('s').join(time_table_short.alias('t'),col('t.start_time') == col('s.start_time'))\ .select(col('s.start_time'), col('s.userId'), col('s.level'), col('s.sessionId'), col('s.location'), col('s.userAgent'), col('s.song'), col('s.artist_id'), col('s.song_id'), col('t.year'), col('t.month'), ) # write time table to parquet files partitioned by year and month time_table = time_table.write.partitionBy('year', 'month').parquet( os.path.join(output_data, 'times.parquet'), 'overwrite') print("time_table partitioned!") # write songplays table to parquet files partitioned by year and month songplays_table = songplays_table.write.partitionBy( 'year', 'month').parquet(os.path.join(output_data, 'songplays.parquet'), 'overwrite') print("songplays_table partitioned!")
def main():
    """
    Get observations near locations from SmartMet Server

    Data start and end time and timestep is fetched from the data.
    Dataset is assumed coherent in means of time and locations, i.e. timestep
    is assumed to be constant between start and end time.
    """
    log1 = logging.getLogger("driver")

    output_directory = 'gs://{}/hadoop/tmp/bigquery/pyspark_output'.format(bucket)
    output_files = output_directory + '/part-*'

    # The train stations data in stations.json
    # The type of trains and their delay in gratu_a_b_2010-14.csv
    JSON_PATH = "gs://trains-data/data/stations.json"
    CSV_PATH = "gs://trains-data/data/full/gratu_a_b_2010-14.csv"

    train_stations_df = spark.read \
        .json(JSON_PATH)

    # parameters for weather data to be fetched from Smartmet server
    params, names = read_parameters('parameters_shorten.txt')

    # base URL for the surface data
    baseurl = 'http://data.fmi.fi/fmi-apikey/9fdf9977-5d8f-4a1f-9800-d80a007579c9/timeseries?format=ascii&separator=,&producer=fmi&tz=local&timeformat=xml&timestep=60&numberofstations=5&maxdistance=100000&param={params}'.format(params=','.join(params))

    urlist = train_stations_df.rdd.flatMap(
        lambda x: ['%s#%s&latlons=%s,%s' % (x.stationShortCode, baseurl,
                                            x.latitude, x.longitude)]).repartition(16)

    data = urlist.map(read_from_URL) \
        .filter(lambda x: x != -1) \
        .flatMap(lambda x: x.splitlines()) \
        .map(lambda x: x.split(','))

    newColumns = names + ["trainstation"]
    schemaString = ' '.join(str(x) for x in newColumns)
    fields = [StructField(field_name, StringType(), True)
              for field_name in schemaString.split()]
    schema = StructType(fields)

    # Apply the schema to the RDD.
    station_weather_df = spark.createDataFrame(data, schema)
    station_weather_df = station_weather_df.withColumn(
        "time", to_utc_timestamp(station_weather_df.time, "%Y-%m-%dT%H"))

    # calculate max_precipitation3h and max_precipitation6h
    col = "max_precipitation1h"
    # to change the "no precipitation" values -1.0 to 0.0
    station_weather_df = station_weather_df.withColumn(
        col,
        f.when(station_weather_df[col] == -1.0, 0.0).otherwise(station_weather_df[col]))

    # using window functions to calculate the precipitation for the
    # previous 3 hours and 6 hours
    w3 = w.partitionBy("trainstation") \
        .orderBy(station_weather_df["time"]) \
        .rowsBetween(-2, 0)
    station_weather_df = station_weather_df.withColumn(
        "max_precipitation3h", f.sum("max_precipitation1h").over(w3))

    w6 = w.partitionBy("trainstation") \
        .orderBy(station_weather_df["time"]) \
        .rowsBetween(-5, 0)
    station_weather_df = station_weather_df.withColumn(
        "max_precipitation6h", f.sum("max_precipitation1h").over(w6))

    # making the surface observation dataframe
    cols = station_weather_df.columns  # list of all columns
    for col in cols:
        station_weather_df = station_weather_df.fillna({col: "-99"})
        station_weather_df = station_weather_df.withColumn(
            col,
            f.when(station_weather_df[col].isin("null", "nan", "NaN", "NULL"),
                   "-99").otherwise(station_weather_df[col]))

    log1.info("Retrieved surface data")

    ## Get flash data
    baseurl = 'http://data.fmi.fi/fmi-apikey/9fdf9977-5d8f-4a1f-9800-d80a007579c9/timeseries?param=time,peak_current&producer=flash&tz=local&timeformat=xml&format=ascii&separator=,'

    urlist = train_stations_df.rdd.flatMap(
        lambda x: ['%s#%s&latlon=%s,%s:30' % (x.stationShortCode, baseurl,
                                              x.latitude, x.longitude)])

    data = urlist.map(getFlash) \
        .filter(lambda x: x != -1) \
        .flatMap(lambda x: x.splitlines()) \
        .map(lambda x: x.split(','))

    schemaString = 'time peakcurrent trainstation'
    fields = [StructField(field_name, StringType(), True)
              for field_name in schemaString.split()]
    schema = StructType(fields)

    flash_df = spark.createDataFrame(data, schema)
    flash_df = flash_df.withColumn(
        "time", to_utc_timestamp(flash_df.time, "%Y%m%dT%HMS"))

    # find the count of flashes in each hour
    extended = (flash_df
                .withColumn("date", f.col("time").cast("date"))
                .withColumn("hour", f.hour(f.col("time"))))
    flash_aggs = extended.groupBy("trainstation", "date", "hour").count()
    flash_aggs = flash_aggs.withColumn(
        'time', f.concat(f.col("date"), f.lit("T"), f.col("hour")))
    flash = flash_aggs.withColumn(
        'time', to_utc_timestamp(flash_aggs.time, "%Y-%m-%dT%H")).select(
            "time", f.col("count").alias("flashcount"), "trainstation")

    log1.info("Retrieved flash data")

    # Combining surface and flash data
    cond = [flash.time == station_weather_df.time,
            flash.trainstation == station_weather_df.trainstation]
    station_weather_flash_df = station_weather_df.alias('a').join(
        flash.alias('b'), cond, 'outer').select(
            'a.*', 'b.flashcount').fillna({'flashcount': '0'})

    # Reading the train type and delay data
    df = spark.read \
        .csv(CSV_PATH)

    # combining the date and time columns and selecting the relevant columns
    df = df.withColumn('t', f.concat(f.col("_c0"), f.lit("T"), f.col("_c1"))).select(
        "t", "_c3", "_c4", "_c9", "_c7", "_c5")

    # converting the time to utc timestamp and adding 1 hour
    df = df.withColumn(
        't', to_utc_timestamp(df.t, "%Y-%m-%dT%H") + f.expr('INTERVAL 1 HOUR'))
    trains_df = df.select(f.col("t").alias("time"),
                          f.col("_c3").alias("trainstation"),
                          f.col("_c4").alias("train_type"),
                          f.col("_c9").alias("train_count"),
                          f.col("_c7").alias("total_delay"),
                          f.col("_c5").alias("delay"))

    # Combining the weather data, both surface and flash, with
    # the train delay and type data
    cond = [trains_df.time == station_weather_flash_df.time,
            trains_df.trainstation == station_weather_flash_df.trainstation]
    trains_station_weather_flash_delay_df = trains_df.join(
        station_weather_flash_df, cond).drop(station_weather_flash_df.time).drop(
            station_weather_flash_df.trainstation)

    log1.info("Created the dataframe with train delay and weather observations. Finished!\n")

    # Saving the data to BigQuery
    (trains_station_weather_flash_delay_df
     .write.format('json').save(output_directory))

    # Shell out to bq CLI to perform BigQuery import.
    subprocess.check_call(
        'bq load --source_format NEWLINE_DELIMITED_JSON '
        '--replace '
        '--autodetect '
        '{dataset}.{table} {files}'.format(
            dataset=output_dataset,
            table=output_table,
            files=output_files
        ).split())

    # Manually clean up the staging directories, otherwise BigQuery
    # files will remain indefinitely.
    output_path = spark._jvm.org.apache.hadoop.fs.Path(output_directory)
    output_path.getFileSystem(spark._jsc.hadoopConfiguration()).delete(
        output_path, True)

    elapsed_time = time.time() - start_time
    log1.info("Elapsed time to retrieve train delay and observation data and save to bq {:10.3f}".format(elapsed_time))
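# Note that to_utc_timestamp(ts, tz) expects a timezone as its second argument
# rather than a format string. If the intent is to parse strings such as
# "2014-01-01T05" into timestamps, the built-in to_timestamp with a Java date
# pattern is the usual tool. A small sketch, assuming the same column names as
# in the function above:
from pyspark.sql import functions as f

station_weather_df = station_weather_df.withColumn(
    "time", f.to_timestamp("time", "yyyy-MM-dd'T'HH"))
flash_aggs = flash_aggs.withColumn(
    "time", f.to_timestamp("time", "yyyy-MM-dd'T'H"))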
def process_log_data(spark, input_data, output_data): # get filepath to log data file log_data = input_data + "log-data/*.json" # read log data file df = spark.read.json(log_data).dropDuplicates() # filter by actions for song plays df = df.filter(df.page == "NextSong") # extract columns for users table users_table = df.select( ["userId", "firstName", "lastName", "gender", "level"]).distinct() users_table.createOrReplaceTempView("users") # write users table to parquet files users_table.write.parquet(output_data + "users/users.parquet", "overwrite") # create timestamp column from original timestamp column get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000), TimestampType()) df = df.withColumn("timestamp", get_timestamp("ts")) # create datetime column from original timestamp column df = df.withColumn("datetime", get_timestamp("ts")) # extract columns to create time table df = df.withColumn("start_time", get_timestamp("ts")) df = df.withColumn("hour", hour("timestamp")) df = df.withColumn("day", dayofmonth("timestamp")) df = df.withColumn("week", weekofyear("timestamp")) df = df.withColumn("month", month("timestamp")) df = df.withColumn("year", year("timestamp")) time_table = df.select( ["start_time", "hour", "day", "week", "month", "year"]).distinct() time_table.createOrReplaceTempView("time") # write time table to parquet files partitioned by year and month time_table.write.partitionBy('year', 'month').parquet( output_data + "time/time.parquet", "overwrite") # read in song data to use for songplays table # And create a log table # No need to create a new / separate 'song_df' var here, since we already have the 'songs' table created above df.createOrReplaceTempView("log_df") # extract columns from joined song and log datasets to create songplays table songplays_table = spark.sql(""" SELECT monotonically_increasing_id() as songplay_id, log_df.start_time as start_time, time.year as year, time.month as month, log_df.userId as user_id, log_df.level as level, songs.song_id as song_id, songs.artist_id as artist_id, log_df.sessionId as session_id, log_df.location as location, log_df.userAgent as user_agent FROM log_df JOIN songs ON log_df.song == songs.title JOIN time ON log_df.start_time == time.start_time """) # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy('year', 'month').parquet( output_data + "songplays/songplays.parquet", "overwrite")
    StructField('total_amount', DoubleType()),
    StructField('payment_type', IntegerType()),
    StructField('trip_type', IntegerType()),
    StructField('congestion_surcharge', DoubleType()),
])

trip_data = spark.read \
    .option("header", True) \
    .schema(trip_schema) \
    .csv("./data/green/*")

trip_data.printSchema()

# trip_data.write.mode("overwrite").parquet("./values/taxi_green")
# trip_data = spark.read.parquet("./values/taxi_green")

extended_trips = trip_data \
    .withColumn("pick_date", f.to_date(trip_data["lpep_pickup_datetime"])) \
    .withColumn("pick_hour", f.hour(trip_data["lpep_pickup_datetime"])) \
    .withColumn("drop_date", f.to_date(trip_data["lpep_dropoff_datetime"])) \
    .withColumn("drop_hour", f.hour(trip_data["lpep_dropoff_datetime"])) \
    .withColumn("duration",
                f.unix_timestamp(trip_data["lpep_dropoff_datetime"]) -
                f.unix_timestamp(trip_data["lpep_pickup_datetime"]))

extended_trips = extended_trips.filter(
    trip_data["lpep_pickup_datetime"] > '2020-01-01 00:00:00')

hourly_taxi_trips = extended_trips \
    .groupBy("pick_date", "pick_hour").agg(
        f.count(extended_trips["fare_amount"]).alias("trip_count"),
        f.sum(extended_trips["passenger_count"]).alias("passenger_count"),
        f.sum(extended_trips["fare_amount"]).alias("fare_amount"),
        f.sum(extended_trips["tip_amount"]).alias("tip_amount"),
        f.sum(extended_trips["total_amount"]).alias("total_amount"),
        f.avg(extended_trips["duration"]).alias("avg_duration")
    )

# hourly_taxi_trips.write.mode("overwrite").parquet("./values/taxi-trips-hourly")
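# If the hourly aggregate is to be persisted, the commented-out write above can
# be partitioned by pick_date so downstream jobs can prune by day. A small
# sketch using the names from the snippet:
hourly_taxi_trips.write \
    .mode("overwrite") \
    .partitionBy("pick_date") \
    .parquet("./values/taxi-trips-hourly")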
# 5g
# Top days for 404 errors
top_err_date_df = errors_by_date_sorted_df.sort('count', ascending=False)
print 'Top Five Dates for 404 Requests:\n'
top_err_date_df.show(5)

# 5h
# Sort
from pyspark.sql.functions import hour
hour_records_sorted_df = not_found_df.groupBy(
    hour('time').alias('hour')).count().sort('hour', ascending=True).cache()
print 'Top hours for 404 requests:\n'
hour_records_sorted_df.show(24)

# 5i
# Plot 404 errors by hour
hours_with_not_found = [row[0] for row in hour_records_sorted_df.collect()]
not_found_counts_per_hour = [row[1] for row in hour_records_sorted_df.collect()]
print hours_with_not_found
print not_found_counts_per_hour

fig, ax = prepareSubplot(np.arange(0, 25, 5), np.arange(0, 500, 50))
def process_log_data(spark, input_data, output_data): """ Description: Process the event log file and extract data for table time, users and songplays from it. :param spark: a spark session instance :param input_data: input file path :param output_data: output file path """ # get filepath to log data file log_data = os.path.join(input_data, "log-data/") # read log data file df = spark.read.json( log_data, mode='PERMISSIVE', columnNameOfCorruptRecord='corrupt_record').drop_duplicates() # filter by actions for song plays df = df.filter(df.page == "NextSong") # extract columns for users table users_table = df.select("userId", "firstName", "lastName", "gender", "level").drop_duplicates() # write users table to parquet files users_table.write.parquet(os.path.join(output_data, "users/"), mode="overwrite") # create timestamp column from original timestamp column get_timestamp = udf(lambda x: datetime.utcfromtimestamp(int(x) / 1000), TimestampType()) df = df.withColumn("start_time", get_timestamp("ts")) # extract columns to create time table time_table = df.withColumn("hour",hour("start_time"))\ .withColumn("day",dayofmonth("start_time"))\ .withColumn("week",weekofyear("start_time"))\ .withColumn("month",month("start_time"))\ .withColumn("year",year("start_time"))\ .withColumn("weekday",dayofweek("start_time"))\ .select("ts","start_time","hour", "day", "week", "month", "year", "weekday").drop_duplicates() # write time table to parquet files partitioned by year and month time_table.write.parquet(os.path.join(output_data, "time_table/"), mode='overwrite', partitionBy=["year", "month"]) # read in song data to use for songplays table song_df = spark.read\ .format("parquet")\ .option("basePath", os.path.join(output_data, "songs/"))\ .load(os.path.join(output_data, "songs/*/*/")) # extract columns from joined song and log datasets to create songplays table songplays_table = df.join(song_df, df.song == song_df.title, how='inner')\ .select(monotonically_increasing_id().alias("songplay_id"),col("start_time"),col("userId").alias("user_id"),"level","song_id","artist_id", col("sessionId").alias("session_id"), "location", col("userAgent").alias("user_agent")) songplays_table = songplays_table.join(time_table, songplays_table.start_time == time_table.start_time, how="inner")\ .select("songplay_id", songplays_table.start_time, "user_id", "level", "song_id", "artist_id", "session_id", "location", "user_agent", "year", "month") # write songplays table to parquet files partitioned by year and month songplays_table.drop_duplicates().write.parquet( os.path.join(output_data, "songplays/"), mode="overwrite", partitionBy=["year", "month"])
def process_log_data(spark, input_data, output_data):
    '''
    Process the log data from the file(s) specified in the parameters.

    Args:
        spark: the spark session
        input_data: path to the input log files
        output_data: path where the output parquet files are written

    Returns:
        modeled data from logs and songs json files that are written to
        parquet files back on S3
    '''
    # get filepath to log data file
    log_data = input_data + "log_data/*/*"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(df.page == 'NextSong')

    # extract columns for users table
    users_table = df.select(
        col('userId').alias('user_id'),
        col('firstName').alias('first_name'),
        col('lastName').alias('last_name'),
        col('gender').alias('gender'),
        col('level').alias('level')).distinct()

    # write users table to parquet files
    users_table.write.parquet(output_data + "users.parquet", mode="overwrite")

    # create timestamp column from original timestamp column
    df = df.withColumn(
        'timestamp',
        f.to_timestamp(
            f.from_unixtime((col('ts') / 1000),
                            'yyyy-MM-dd HH:mm:ss.SSS')).cast("Timestamp"))

    # create datetime column from original timestamp column
    df = df.withColumn('ts_datetime', f.to_date(col('timestamp')))

    # extract columns to create time table
    time_table = df.withColumn("hour", hour(col("timestamp"))) \
        .withColumn("day", dayofmonth(col("timestamp"))) \
        .withColumn("week", weekofyear(col("timestamp"))) \
        .withColumn("month", month(col("timestamp"))) \
        .withColumn("year", year(col("timestamp"))) \
        .withColumn("weekday", dayofweek(col("timestamp"))) \
        .select(
            col("timestamp").alias("start_time"),
            col("hour"),
            col("day"),
            col("week"),
            col("month"),
            col("year"),
            col("weekday")
        )

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(output_data + "time.parquet", mode="overwrite")

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + "songs.parquet")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.withColumn(
        'songplay_id', F.monotonically_increasing_id()).join(
            song_df, song_df.title == df.song).select(
                'songplay_id',
                col('timestamp').alias('start_time'),
                col('userId').alias('user_id'),
                'level', 'song_id', 'artist_id',
                col('sessionId').alias('session_id'),
                'location',
                col('userAgent').alias('user_agent'))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(output_data + "songplays.parquet",
                                  mode="overwrite")
def process_log_data(spark, input_data, output_data): """ Description: This function loads log_data from S3 and extracts the songs and artist tablesafter processing and then write those generated tables to S3 in parquet format. Also output from previous function is used in by spark.read.json command Parameters: spark : Spark Session input_data : Location of log_data files output_data : S3 bucket where extracted tables are written in parquet format. """ # get filepath to log data file log_data = input_data + 'log_data/*/*/*.json' # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.filter(df.page == 'NextSong') # extract columns for users table users_table = df.select('userId','firstName', 'lastname', 'gender', 'level').dropDuplicates()\ .where(df.userId.isNotNull()) # write users table to parquet files users_table.write.parquet(output + 'users/') # create UDF for timestamp column from original timestamp column @udf(TimestampType()) def conv_timestamp(ms): return datetime.fromtimestamp(ms / 1000.0) # Lets add one more column with correct usable time stamp format df = df.withColumn("start_time", conv_timestamp('ts')) # Create a dataframe which only has start_time log_time_data = df.select('start_time').dropDuplicates()\ .where(df.start_time.isNotNull()) # extract columns to create time table time_table = log_time_data.withColumn('hour',hour('start_time'))\ .withColumn('day',dayofmonth('start_time'))\ .withColumn('week', weekofyear('start_time'))\ .withColumn('month', month('start_time'))\ .withColumn('year',year('start_time'))\ .withColumn("weekday", date_format("start_time", 'E')) # write time table to parquet files partitioned by year and month time_table.write.partitionBy("year", "month").parquet(output_data + 'times/') # create a view for the log_data and we already have the view for song_data as song created at the start df.createOrReplaceTempView('log_data_filtered_timeformatted') # extract columns from joined song and log datasets to create songplays table songplays_table = spark.sql( """SELECT monotonically_increasing_id() AS songplay_id, start_time, userId AS user_id, level, song_id, artist_id, sessionId AS session_id, location, userAgent AS user_agent FROM log_data_filtered_timeformatted JOIN song ON artist = artist_name AND song = title """) # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy("year", "month").parquet(output_data + 'songplays/')
def process_log_data(spark, input_data, output_data): """ Reads logs data in a dataframe which is then used to create new dataframes for creating users and time tables. Reads songs data and join it with logs dataframe to create a data for songplays table. Drop duplicates, rename columns and finally saves all tables in parquet format. :param spark: Spark session object :param input_data: S3 or local dir containing song data :param output_data: Path for parquet output files """ # get filepath to log data file log_data = input_data + "log_data/*/*/*.json" # S3 dir structure # log_data = input_data + "log_data/*.json" # local dir structure # read log data file logger.info('Reading log data json files') df = spark.read.json(log_data) # filter by actions for song plays df = df[df['page'] == 'NextSong'] # extract columns for users table users_table = df[['userId', 'firstName', 'lastName', 'gender', 'level']] users_table = users_table \ .withColumnRenamed('userId', 'user_id') \ .withColumnRenamed('firstName', 'first_name') \ .withColumnRenamed('lastName', 'last_name') \ .dropDuplicates() # write users table to parquet files logger.info('Writing users table in parquet format') users_table.write.parquet(output_data + '/tbl_users.parquet') # create timestamp column from original timestamp column get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000.0), TimestampType()) df = df.withColumn('start_time', get_timestamp(df.ts)) # create datetime columns from derived start_time column df = df.withColumn('hour', hour(df.start_time)) df = df.withColumn('day', dayofmonth(df.start_time)) df = df.withColumn('week', weekofyear(df.start_time)) df = df.withColumn('month', month(df.start_time)) df = df.withColumn('year', year(df.start_time)) df = df.withColumn('weekday', dayofweek(df.start_time)) # extract columns to create time table time_table = df[[ 'start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday' ]] time_table = time_table.dropDuplicates() # write time table to parquet files partitioned by year and month logger.info( 'Writing time table partitioned by year and month in parquet format') time_table.write.partitionBy('year', 'month').parquet(output_data + '/tbl_time.parquet') # read in song data to use for songplays table logger.info("Reading song data for join") song_df = spark.read.json(input_data + 'song_data/*/*/*/*.json') song_df = song_df.withColumnRenamed('year', 'song_year') # extract columns from joined song and log datasets to create songplays table songplays_table = df.join(song_df, song_df.artist_name == df.artist, 'inner') songplays_table = songplays_table.withColumn( "songplay_id", F.monotonically_increasing_id()) songplays_table = songplays_table[[ 'songplay_id', 'start_time', 'userId', 'level', 'song_id', 'artist_id', 'sessionId', 'location', 'userAgent', 'month', 'year' ]] songplays_table = songplays_table \ .withColumnRenamed('userId', 'user_id') \ .withColumnRenamed('sessionId', 'session_id') \ .withColumnRenamed('userAgent', 'user_agent') # write songplays table to parquet files partitioned by year and month logger.info( 'Writing songplays table partitioned by year and month in parquet format' ) songplays_table.write.partitionBy( 'year', 'month').parquet(output_data + '/tbl_songplays.parquet')
def process_log_data(spark, input_data, output_data): """ Process log_data from input_data path and save users, time and songplays tables in paquet format in output_data path Parameters: spark: SparkSession object to process data input_data: path to input data output_data: path to output data """ # get filepath to log data file log_data = input_data + 'log_data/*' # read log data file log_df = spark.read.json(log_data) # filter by actions for song plays log_df = log_df.filter('page = "NextSong"') \ .withColumn('user_id', log_df['userId'].cast('integer')) \ .withColumn('session_id', log_df['sessionId'].cast('integer')) \ .withColumnRenamed('firstName', 'first_name') \ .withColumnRenamed('lastName', 'last_name') # extract columns for users table users_table = log_df[['user_id', 'first_name', 'last_name', 'gender', 'level']] # write users table to parquet files users_table.where(users_table.user_id.isNotNull()).distinct().write.mode('overwrite').parquet(output_data + 'users/') # create timestamp column from original timestamp column time_df = log_df[['ts']] # create datetime column from original timestamp column time_df = time_df.withColumn('ts', to_timestamp(col('ts')/1000)) # extract columns to create time table time_table = time_df.withColumnRenamed('ts', 'start_time') \ .withColumn('hour', hour(col('start_time'))) \ .withColumn('day', dayofmonth(col('start_time'))) \ .withColumn('week', weekofyear(col('start_time'))) \ .withColumn('month', month(col('start_time'))) \ .withColumn('year', year(col('start_time'))) \ .withColumn('weekday', date_format(col('start_time'), 'u').cast('integer')) # write time table to parquet files partitioned by year and month time_table.distinct().write.partitionBy('year', 'month').mode('overwrite').parquet(output_data + 'time/') # read in song data to use for songplays table song_df = spark.read.json(input_data + 'song_data/*/*/*') # extract columns from joined song and log datasets to create songplays table songplays_table = log_df.join(song_df, [log_df.song == song_df.title, log_df.artist == song_df.artist_name]) \ .selectExpr('monotonically_increasing_id() as songplay_id', \ 'to_timestamp(ts/1000) as start_time', \ 'month(to_timestamp(ts/1000)) as month', \ 'year(to_timestamp(ts/1000)) as year', \ 'user_id as user_id', \ 'level as level', \ 'song_id as song_id', \ 'artist_id as artist_id', \ 'session_id as session_id', \ 'location as location', \ 'userAgent as user_agent') # write songplays table to parquet files partitioned by year and month songplays_table.write.mode('overwrite').partitionBy('year', 'month').parquet(output_data+'songplays/')
def process_log_data(spark, input_data, output_data): """ Function that read and transform log_data files to save user_table, time_table and songplays_table on S3 (in parquet extension) """ # get filepath to log data file log_data = input_data + "log_data/*/*/*.json" #real path # log_data = input_data + "log_data/2018/11/2018-11-12-events.json" # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.where("page='NextSong'") # extract columns for users table user_table = df.select(col("userId").cast("int").alias("user_id"),\ col("firstName").alias("first_name"),\ col("lastName").alias("last_name"),"gender","level") user_table = user_table.dropDuplicates() # write users table to parquet files user_table.write.parquet(output_data + 'users/', 'overwrite') # create timestamp column from original timestamp column get_timestamp = udf(lambda x: int(x) / 1000, IntegerType()) df = df.withColumn("timestamp", get_timestamp("ts")) # create datetime column from original timestamp column get_datetime = udf(lambda x: datetime.fromtimestamp(x), TimestampType()) df = df.withColumn("datetime", get_datetime("timestamp")) # print(df.limit(5).toPandas().head()) # extract columns to create time table time_table = df.select(col("timestamp").alias("start_time"),\ hour("datetime").alias("hour"),\ dayofmonth("datetime").alias("day"),\ weekofyear("datetime").alias("week"),\ month("datetime").alias("month"),\ year("datetime").alias("year"),\ date_format('datetime','E').alias('weekday') ) # print(time_table.limit(5).toPandas().head()) time_table = time_table.dropDuplicates() # write time table to parquet files partitioned by year and month time_table.write.partitionBy("year", "month").parquet(output_data + 'time/', 'overwrite') # read in song data to use for songplays table # song_df = spark.read.json(input_data + "song_data/A/B/C/TRABCEI128F424C983.json") song_df = spark.read.json(input_data + "song_data/*/*/*/*.json") # extract columns from joined song and log datasets to create songplays table songplays_table = df.alias("a").join(song_df.alias("b"),\ (df.song == song_df.title) & (df.artist == song_df.artist_name) & (df.length == song_df.duration)).\ select(col("a.ts").alias("start_time"),col("a.userId").cast("int").alias("a.user_id"),"level",\ col("a.sessionId").alias("session_id"),"a.location","a.userAgent","b.song_id","b.artist_id") get_start_time = udf(lambda x: datetime.fromtimestamp(int(x) / 1000), TimestampType()) songplays_table = songplays_table.withColumn("start_time", get_start_time("start_time")) songplays_table = songplays_table.withColumn("songplay_id", monotonically_increasing_id()) songplays_table = songplays_table.withColumn("year", year("start_time")) songplays_table = songplays_table.withColumn("month", month("start_time")) # print(songplays_table.limit(5).toPandas().head()) songplays_table = songplays_table.dropDuplicates() # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy("year", "month").parquet( output_data + 'songplays/', 'overwrite')
def process_log_data(spark, input_data, output_data): # get filepath to log data file log_data = input_data + "log-data/*/*/*.json" # read log data file print('Input log data json file read started') df = spark.read.json(log_data, mode='PERMISSIVE', columnNameOfCorruptRecord='corrupt_record') print('Input log data json file read completed') # filter by actions for song plays df = df.filter(df.page == 'NextSong') # extract columns for users table print('users_table data exteaction started \n') users_table = df.select("userId", "firstName", "lastName", "gender", "level").drop_duplicates() print('users_table data exteaction completed \n') # write users table to parquet files print('users_table data write started \n') users_table.write.parquet(output_data + "users_table/", mode="overwrite") print('users_table write Completed') # create timestamp column from original timestamp column get_timestamp = udf(lambda x: datetime.fromtimestamp(int(x) / 1000), TimestampType()) df = df.withColumn("start_time", get_timestamp("ts")) # create datetime column from original timestamp column #get_datetime = udf() #df = # extract columns to create time table print('time_table data extraction started \n') time_table=df.select('start_time').drop_duplicates() \ .withColumn('hour', hour(col('start_time'))) \ .withColumn('day', dayofmonth(col('start_time'))) \ .withColumn('week', weekofyear(col('start_time'))) \ .withColumn('month', month(col('start_time'))) \ .withColumn('year', year(col('start_time'))) \ .withColumn('weekday', dayofweek(col('start_time'))) #time_table.show() print('time_table data extraction completed \n') # write time table to parquet files partitioned by year and month print('time_table data write started \n') time_table.write.parquet(output_data + "time_table/", mode="overwrite") print('time_table data write Completed \n') # read in song data to use for songplays table song_df = spark.read.format("parquet").option( "basePath", os.path.join(output_data, "songs/")).load(os.path.join(output_data, "songs/*/*/")) # extract columns from joined song and log datasets to create songplays table print('songplays_table data extraction started \n') songplays_table = df.join(song_df, df.song == song_df.title, how='inner')\ .select(monotonically_increasing_id().alias("songplay_id"), col("start_time"), col("userId").alias("user_id"), col("level"), col("song_id"), col("artist_id"), col("sessionId").alias("session_id"), col("location"), col("userAgent").alias("user_agent") ) print('songplays_table data extraction completed \n') # write songplays table to parquet files partitioned by year and month print('songplays_table data write started \n') songplays_table = songplays_table.write.parquet(output_data + "songplays/", mode="overwrite") print('songsplay_table data write completed \n')
# Dates and Timestamps
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('dates').getOrCreate()
df = spark.read.csv('appl_stock.csv', header=True, inferSchema=True)
df.head(1)
df.show()
df.select(['Date', 'Open']).show()

# Date format: year-month-day hour
from pyspark.sql.functions import (dayofmonth, hour, dayofyear, month, year, weekofyear,
                                   format_number, date_format)

df.select(dayofmonth(df['Date'])).show()
df.select(hour(df['Date'])).show()
df.select(month(df['Date'])).show()
df.select(year(df['Date'])).show()
df.withColumn("Year", year(df['Date'])).show()  # Appends a Year column at the end
newdf = df.withColumn("Year", year(df['Date']))
newdf.groupBy("Year").mean().show()  # Computes the average of every column for each year
newdf.groupBy("Year").mean().select(["Year", "avg(Close)"]).show()  # Same averages, showing only the Year and avg(Close) columns
result = newdf.groupBy("Year").mean().select(["Year", "avg(Close)"])
result.show()
result.withColumnRenamed("avg(Close)", "Average Closing Price").show()  # Rename avg(Close) to Average Closing Price
new = result.withColumnRenamed("avg(Close)", "Average Closing Price")
new.select(['Year', format_number('Average Closing Price', 2)]).show()  # The column is now (Average Closing Price) with values rounded to 2 decimals
new.select(['Year', format_number('Average Closing Price', 2).alias("avg(Close)")]).show()  # Rename it back to avg(Close)
#--------#
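# Hedged aside (not in the original tutorial): date_format, imported above but
# not exercised, renders a date/timestamp column using a Java SimpleDateFormat
# pattern, e.g. collapsing each date to a year-month string.
df.select(date_format(df['Date'], 'yyyy-MM').alias('year_month')).show(5)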
# Total: 834648 # Unique: 11691 #totalCount = allAddId.select("advertisement_id").count() #uniqueCount = allAddId.select("advertisement_id").distinct().count() # Drop distance column allAdId = allAdId.drop("distance") # Create date column for indexing and aggregation allAdIdDate = allAdId.select("*", col("date_time").cast("date").alias("date")) # Create hour column for indexing and aggregation allAdIdHour = allAdIdDate.select( "*", hour("date_time").cast("int").alias("broadcast_hour")) #bukitBintang = allAdIdHour.toPandas() #bukitBintang.to_csv("bukit_bintang.csv", index = False) # Filter IDFAs (advertisement_id) that are from February 1st and horizontal # accuracy are within 30.00m or not null allAdIdFilter = allAdIdHour.filter((allAdIdHour.date == "2019-02-01") & ( (allAdIdHour.horizontal_accuracy <= 30) & (allAdIdHour.horizontal_accuracy.isNotNull()))) # Drop horizontal accuracy since it's redundant for future analysis allAdIdClean = allAdIdFilter.drop("horizontal_accuracy", "date", "latitude", "longitude") # Aggregate advertisement_id by hour
def process_log_data(spark, input_data, output_data): """ Description: This function loads log_data from S3 and processes it by extracting the songs and artist tables and then again loaded back to S3. Also output from previous function is used in by spark.read.json command Parameters: spark : Spark Session input_data : location of log_data json files with the events data output_data : S3 bucket were dimensional tables in parquet format will be stored """ # get filepath to log data file log_data = os.path.join(input_data, "log_data/*.json") # read log data file df = spark.read.json(log_data) # filter by actions for song plays actions_df = df.filter(df.page == 'NextSong') \ .select('ts', 'userId', 'level', 'song', 'artist', 'sessionId', 'location', 'userAgent') # extract columns for users table users_table = df.select('userId', 'firstName', 'lastName', 'gender', 'level').dropDuplicates() users_table.createOrReplaceTempView('users') # write users table to parquet files users_table.write.parquet(os.path.join(output_data, 'users/users.parquet'), 'overwrite') # create timestamp column from original timestamp column get_timestamp = udf(lambda x: str(int(int(x) / 1000))) actions_df = actions_df.withColumn('timestamp', get_timestamp(actions_df.ts)) # create datetime column from original timestamp column get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000))) actions_df = actions_df.withColumn('datetime', get_datetime(actions_df.ts)) # extract columns to create time table time_table = actions_df.select('datetime') \ .withColumn('start_time', actions_df.datetime) \ .withColumn('hour', hour('datetime')) \ .withColumn('day', dayofmonth('datetime')) \ .withColumn('week', weekofyear('datetime')) \ .withColumn('month', month('datetime')) \ .withColumn('year', year('datetime')) \ .withColumn('weekday', dayofweek('datetime')) \ .dropDuplicates() # write time table to parquet files partitioned by year and month time_table.write.partitionBy('year', 'month') \ .parquet(os.path.join(output_data, 'time/time.parquet'), 'overwrite') # read in song data to use for songplays table song_data = os.path.join(input_data, "song_data/*/*/*/*.json") song_df = spark.read.json(song_data) # extract columns from joined song and log datasets to create songplays table actions_df = actions_df.alias('log_df') song_df = song_df.alias('song_df') joined_df = actions_df.join( song_df, col('log_df.artist') == col('song_df.artist_name'), 'inner') songplays_table = joined_df.select( col('log_df.datetime').alias('start_time'), col('log_df.userId').alias('user_id'), col('log_df.level').alias('level'), col('song_df.song_id').alias('song_id'), col('song_df.artist_id').alias('artist_id'), col('log_df.sessionId').alias('session_id'), col('log_df.location').alias('location'), col('log_df.userAgent').alias('user_agent'), year('log_df.datetime').alias('year'), month('log_df.datetime').alias('month')) \ .withColumn('songplay_id', monotonically_increasing_id()) songplays_table.createOrReplaceTempView('songplays') # write songplays table to parquet files partitioned by year and month time_table = time_table.alias('timetable') songplays_table.write.partitionBy('year', 'month'). \ parquet(os.path.join(output_data, 'songplays/songplays.parquet'), \ 'overwrite') print("--- songplays.parquet completed ---") print("*** process_log_data completed ***\n\nEND")
def main(base_path):
    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),
        StructField("CRSArrTime", TimestampType(), True),
        StructField("CRSDepTime", TimestampType(), True),
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Route", StringType(), True),
        StructField("TailNum", StringType(), True),
        StructField("EngineManufacturer", StringType(), True),
        StructField("EngineModel", StringType(), True),
        StructField("Manufacturer", StringType(), True),
        StructField("ManufacturerYear", StringType(), True),
        StructField("OwnerState", StringType(), True),
        StructField("FlightTime", IntegerType(), True),
    ])

    input_path = "{}/data/simple_flight_delay_features_flight_times.json".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Add the hour of day of scheduled arrival/departure
    #
    from pyspark.sql.functions import hour
    features_with_hour = features.withColumn("CRSDepHourOfDay",
                                             hour(features.CRSDepTime))
    features_with_hour = features_with_hour.withColumn(
        "CRSArrHourOfDay", hour(features.CRSArrTime))
    features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime",
                              "CRSArrHourOfDay").show()

    #
    # Check the features for null values before using Spark ML
    #
    null_counts = [
        (column,
         features_with_hour.where(features_with_hour[column].isNull()).count())
        for column in features_with_hour.columns
    ]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print("\nNull Value Report")
    print("-----------------")
    print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Set up the bucketizer model
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Import the feature tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    string_columns = ["Carrier", "Origin", "Dest", "Route", "TailNum"]
    for column in string_columns:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")
outputCol=column + "_index") string_indexer_model = string_indexer.fit(ml_bucketized_features) ml_bucketized_features = string_indexer_model.transform( ml_bucketized_features) # Save the pipeline model string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format( base_path, column) string_indexer_model.write().overwrite().save( string_indexer_output_path) # 연속형 숫자 필드를 범주형 필드의 인덱스와 결합해서 하나의 특징 벡터를 만듦 numeric_columns = [ "DepDelay", "Distance", "DayOfYear", "CRSDepHourOfDay", "CRSArrHourOfDay", "FlightTime" ] index_columns = [column + "_index" for column in string_columns] vector_assembler = VectorAssembler(inputCols=numeric_columns + index_columns, outputCol="Features_vec") final_vectorized_features = vector_assembler.transform( ml_bucketized_features) # 수치 벡터 어셈블러 저장 vector_assembler_path = "{}/models/numeric_vector_assembler_6.0.bin".format( base_path) vector_assembler.write().overwrite().save(vector_assembler_path) # 인덱스 열 제거 for column in index_columns: final_vectorized_features = final_vectorized_features.drop(column) # 확정된 특징 검사 final_vectorized_features.show() # # 분류 모델 교차 검증, 훈련, 평가: 4개의 지표에 대해 5번 반복 # from collections import defaultdict scores = defaultdict(list) feature_importances = defaultdict(list) metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"] split_count = 3 for i in range(1, split_count + 1): print("\nRun {} out of {} of test/train splits in cross validation...". format( i, split_count, )) # 테스트/훈련 데이터 분할 training_data, test_data = final_vectorized_features.randomSplit( [0.8, 0.2]) # 모든 데이터에 대해 랜덤 포레스트 분류 모델 인스턴스화 및 적합 from pyspark.ml.classification import RandomForestClassifier rfc = RandomForestClassifier( featuresCol="Features_vec", labelCol="ArrDelayBucket", predictionCol="Prediction", maxBins=4896, ) model = rfc.fit(training_data) # 새 모델을 이전 모델 위에 덮어쓰기 model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.flight_time.bin".format( base_path) model.write().overwrite().save(model_output_path) # 테스트 데이터로 모델 평가 predictions = model.transform(test_data) # 이 테스트/훈련 데이터 분할의 결과를 각 지표별로평가 from pyspark.ml.evaluation import MulticlassClassificationEvaluator for metric_name in metric_names: evaluator = MulticlassClassificationEvaluator( labelCol="ArrDelayBucket", predictionCol="Prediction", metricName=metric_name) score = evaluator.evaluate(predictions) scores[metric_name].append(score) print("{} = {}".format(metric_name, score)) # # 특징 중요도 수집 # feature_names = vector_assembler.getInputCols() feature_importance_list = model.featureImportances for feature_name, feature_importance in zip(feature_names, feature_importance_list): feature_importances[feature_name].append(feature_importance) # # 지표별 평균과 표준편차 평가 및 표로 출력 # import numpy as np score_averages = defaultdict(float) # 표 데이터 계산 average_stds = [] # ha for metric_name in metric_names: metric_scores = scores[metric_name] average_accuracy = sum(metric_scores) / len(metric_scores) score_averages[metric_name] = average_accuracy std_accuracy = np.std(metric_scores) average_stds.append((metric_name, average_accuracy, std_accuracy)) # 표 출력 print("\nExperiment Log") print("--------------") print(tabulate(average_stds, headers=["Metric", "Average", "STD"])) # # 점수를 실행 사이에 존재하는 점수 로그에 유지 # import pickle # 점수 로그를 적재하거나 빈 로그를 초기화 try: score_log_filename = "{}/models/score_log.pickle".format(base_path) score_log = pickle.load(open(score_log_filename, "rb")) if not isinstance(score_log, list): score_log = [] except IOError: score_log = [] # 기존 점수 로그 계산 score_log_entry = { 
    # Compute and display the change in score for each metric
    try:
        last_log = score_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_log = score_log_entry

    experiment_report = []
    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        experiment_report.append((metric_name, run_delta))

    print("\nExperiment Report")
    print("-----------------")
    print(tabulate(experiment_report, headers=["Metric", "Score"]))

    # Append the current average scores to the log
    score_log.append(score_log_entry)

    # Persist the log for the next run
    pickle.dump(score_log, open(score_log_filename, "wb"))

    #
    # Analyze and report changes in feature importance
    #

    # Compute the average importance for each feature
    feature_importance_entry = defaultdict(float)
    for feature_name, value_list in feature_importances.items():
        average_importance = sum(value_list) / len(value_list)
        feature_importance_entry[feature_name] = average_importance

    # Sort the feature importances in descending order and print
    import operator
    sorted_feature_importances = sorted(feature_importance_entry.items(),
                                        key=operator.itemgetter(1),
                                        reverse=True)

    print("\nFeature Importances")
    print("-------------------")
    print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))

    #
    # Compare this run's feature importances with the previous run's
    #

    # Load the feature importance log or initialize an empty one
    try:
        feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
        feature_log = pickle.load(open(feature_log_filename, "rb"))
        if not isinstance(feature_log, list):
            feature_log = []
    except IOError:
        feature_log = []

    # Compute and display the change in importance for each feature
    try:
        last_feature_log = feature_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_feature_log = defaultdict(float)
        for feature_name, importance in feature_importance_entry.items():
            last_feature_log[feature_name] = importance

    # Compute the deltas
    feature_deltas = {}
    for feature_name in feature_importances.keys():
        run_delta = feature_importance_entry[feature_name] - last_feature_log[
            feature_name]
        feature_deltas[feature_name] = run_delta

    # Sort the feature deltas so the biggest change comes first
    import operator
    sorted_feature_deltas = sorted(feature_deltas.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)

    # Display the sorted feature deltas
    print("\nFeature Importance Delta Report")
    print("-------------------------------")
    print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))

    # Append the current average importances to the log
    feature_log.append(feature_importance_entry)

    # Persist the log for the next run
    pickle.dump(feature_log, open(feature_log_filename, "wb"))
def process_log_data(spark, input_data, output_data): """ This function takes the log data from Udacity's S3 input file and processes it. This is done by extracting the user, time and songplay tables and then loading it back to the S3 buckegt I've created in AWS. Parameters: spark : Spark Session input_data : The S3 bucket location of song_data, think 'input' output_data : The S3 bucket location of the song_data, think 'ouput' """ #Using print statement to understand where in spark statement we are print("\n Taking in log data as variable from S3's input location....") # get full filepath to song data file #log_data = input_data + 'log_data/*/*/*.json' #utilizing exact folder set of data set to speed up execution in WorkSpace (please use commented out log_data variable above to run full etl with wildcards) log_data = input_data + 'log_data/2018/11/*.json' #Using print statement to understand where in spark statement we are print("\n Defining log Schema....") log_schema = Struct([SFld("artist", Str()), SFld("auth", Str()), SFld("firstName", Str()), SFld("gender", Str()), SFld("itemInSession", Lng()), SFld("lastName", Str()), SFld("length", Dbl()), SFld("level", Str()), SFld("location", Str()), SFld("method", Str()), SFld("page", Str()), SFld("registration", Dbl()), SFld("sessionId", Lng()), SFld("song", Str()), SFld("status", Str()), SFld("ts", Str()), SFld("userAgent", Str()), SFld("userId", Str())]) #Using print statement to understand where in spark statement we are print("\n Reading log data JSON files from S3's input location....") # read log data file df = spark.read.json(log_data, schema = log_schema, mode='PERMISSIVE', columnNameOfCorruptRecord='corruptRecord').drop_duplicates() #Using print statement to understand where in spark statement we are print("\n Filtering page by NextSong....") # filter by actions for song plays df = df.filter(df.page == 'NextSong').drop_duplicates() #Using print statement to understand where in spark statement we are print("\n Creating select statement for users data creation....") # extract columns for users table users_table = df.select('userId', 'firstName', 'lastName', 'gender', 'level').where(df.userId != None).drop_duplicates() #Using print statement to understand where in spark statement we are print("\n Writing parquet file for users table....") # write users table to parquet files users_table.write.mode('overwrite').parquet(output_data + 'users_table/') #Using print statement to understand where in spark statement we are print("\n Creating timeStamp variable....") # create timestamp column from original timestamp column df = df.withColumn("timestamp", to_timestamp(from_unixtime(col("ts") / 1000))) #Using print statement to understand where in spark statement we are print("\n Creating select statement for time data creation....") # extract columns to create time table time_table = ( df.select("timestamp").withColumn("hour", hour("timestamp")).withColumn("day", dayofmonth("timestamp")) \ .withColumn("week", weekofyear("timestamp")).withColumn("weekday", dayofweek("timestamp")).withColumn("weekdayName", date_format("timestamp", "E")) \ .withColumn("month", month("timestamp")).withColumn("year", year("timestamp")).drop_duplicates() ) #Using print statement to understand where in spark statement we are print("\n Writing parquet file for time table and partitioned by year and month....") # write time table to parquet files partitioned by year and month time_table.write.mode('overwrite').partitionBy('year', 'month').parquet(output_data + 'time_table/') 
    #Using print statement to understand where in spark statement we are
    print("\n Reading songs table parquet files from S3's output location....")
    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + 'songs_table/')

    #Using print statement to understand where in spark statement we are
    print("\n Creating select statement for song play data creation....")
    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.withColumn('songplayId', F.monotonically_increasing_id()).join(song_df, song_df.title == df.song) \
                        .select('songplayId', col('timestamp').alias('start_time'), col('userId'), 'level', 'song_id',
                                'artist_id', col('sessionId'), 'location', col('userAgent'))

    songplays_table = songplays_table.join(time_table, songplays_table.start_time == time_table.timestamp, how="inner")\
                                     .select("songplayId", songplays_table.start_time, "userId", "level", "song_id",
                                             "artist_id", "sessionId", "location", "userAgent", "month", "year").drop_duplicates()

    #Using print statement to understand where in spark statement we are
    print("\n Writing parquet file for songplay table, partitioned by year and month....")
    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode('overwrite').partitionBy("year", "month").parquet(output_data + 'songplays_table/')
def process_log_data(spark, input_data, output_data):
    '''
    Process log data to build the user, time and songplays tables and write them to parquet files

    Inputs:
        spark: spark session
        input_data: path to data files to extract the data
        output_data: path where the created tables will be stored
    '''
    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    actions_df = df.filter(df.page == 'NextSong').select(
        'ts', 'userId', 'level', 'song', 'artist', 'sessionId', 'location', 'userAgent')

    # extract columns for users table
    users_table = df.select('userId', 'firstName', 'lastName', 'gender', 'level').dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet((output_data + 'users/users.parquet'), 'overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: str(int(int(x) / 1000)))
    df = actions_df.withColumn('timestamp', get_timestamp(actions_df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000)))
    df = df.withColumn('start_time', get_datetime(df.ts))

    # extract columns to create time table
    df = df.withColumn('hour', hour('start_time'))
    df = df.withColumn('day', dayofmonth('start_time'))
    df = df.withColumn('month', month('start_time'))
    df = df.withColumn('year', year('start_time'))
    df = df.withColumn('week', weekofyear('start_time'))
    df = df.withColumn('weekday', dayofweek('start_time'))
    time_table = df.select('start_time', 'hour', 'day', 'week', 'month', 'year',
                           'weekday').dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        (output_data + 'time/time.parquet'), 'overwrite')

    # read in song data to use for songplays table
    song_df = spark.read.json(input_data + 'song_data/A/*/*/*.json')
    df = df.join(song_df, song_df.title == df.song)

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.select(
        'start_time', 'userId', 'level', 'song_id', 'artist_id', 'sessionId',
        'location', 'userAgent', 'year',
        'month').withColumn('songplay_id', monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        (output_data + 'songplays/songplays.parquet'), 'overwrite')
def main(base_path): APP_NAME = "train_spark_mllib_model.py" # If there is no SparkSession, create the environment try: sc and spark except NameError as e: import findspark findspark.init() import pyspark import pyspark.sql sc = pyspark.SparkContext() spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate() # # { # "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00", # "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0, # "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS" # } # from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType from pyspark.sql.types import StructType, StructField from pyspark.sql.functions import udf schema = StructType([ StructField("ArrDelay", DoubleType(), True), StructField("CRSArrTime", TimestampType(), True), StructField("CRSDepTime", TimestampType(), True), StructField("Carrier", StringType(), True), StructField("DayOfMonth", IntegerType(), True), StructField("DayOfWeek", IntegerType(), True), StructField("DayOfYear", IntegerType(), True), StructField("DepDelay", DoubleType(), True), StructField("Dest", StringType(), True), StructField("Distance", DoubleType(), True), StructField("FlightDate", DateType(), True), StructField("FlightNum", StringType(), True), StructField("Origin", StringType(), True), StructField("Route", StringType(), True), StructField("TailNum", StringType(), True), StructField("EngineManufacturer", StringType(), True), StructField("EngineModel", StringType(), True), StructField("Manufacturer", StringType(), True), StructField("ManufacturerYear", StringType(), True), StructField("OwnerState", StringType(), True), ]) input_path = "{}/data/simple_flight_delay_features_airplanes.json".format( base_path ) features = spark.read.json(input_path, schema=schema) features.first() # # Add the hour of day of scheduled arrival/departure # from pyspark.sql.functions import hour features_with_hour = features.withColumn( "CRSDepHourOfDay", hour(features.CRSDepTime) ) features_with_hour = features_with_hour.withColumn( "CRSArrHourOfDay", hour(features.CRSArrTime) ) features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime", "CRSArrHourOfDay").show() # # Check for nulls in features before using Spark ML # null_counts = [(column, features_with_hour.where(features_with_hour[column].isNull()).count()) for column in features_with_hour.columns] cols_with_nulls = filter(lambda x: x[1] > 0, null_counts) print("\nNull Value Report") print("-----------------") print(tabulate(cols_with_nulls, headers=["Column", "Nulls"])) # # Use pysmark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2) # from pyspark.ml.feature import Bucketizer # Setup the Bucketizer splits = [-float("inf"), -15.0, 0, 30.0, float("inf")] arrival_bucketizer = Bucketizer( splits=splits, inputCol="ArrDelay", outputCol="ArrDelayBucket" ) # Save the model arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path) arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path) # Apply the model ml_bucketized_features = arrival_bucketizer.transform(features_with_hour) ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show() # # Extract features tools in with pyspark.ml.feature # from pyspark.ml.feature import StringIndexer, VectorAssembler # Turn category fields into indexes string_columns = ["Carrier", "Origin", 
"Dest", "Route", "TailNum"] for column in string_columns: string_indexer = StringIndexer( inputCol=column, outputCol=column + "_index" ) string_indexer_model = string_indexer.fit(ml_bucketized_features) ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features) # Save the pipeline model string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format( base_path, column ) string_indexer_model.write().overwrite().save(string_indexer_output_path) # Combine continuous, numeric fields with indexes of nominal ones # ...into one feature vector numeric_columns = [ "DepDelay", "Distance", "DayOfYear", "CRSDepHourOfDay", "CRSArrHourOfDay"] index_columns = [column + "_index" for column in string_columns] vector_assembler = VectorAssembler( inputCols=numeric_columns + index_columns, outputCol="Features_vec" ) final_vectorized_features = vector_assembler.transform(ml_bucketized_features) # Save the numeric vector assembler vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(base_path) vector_assembler.write().overwrite().save(vector_assembler_path) # Drop the index columns for column in index_columns: final_vectorized_features = final_vectorized_features.drop(column) # Inspect the finalized features final_vectorized_features.show() # # Cross validate, train and evaluate classifier: loop 5 times for 4 metrics # from collections import defaultdict scores = defaultdict(list) feature_importances = defaultdict(list) metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"] split_count = 3 for i in range(1, split_count + 1): print("\nRun {} out of {} of test/train splits in cross validation...".format( i, split_count, ) ) # Test/train split training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2]) # Instantiate and fit random forest classifier on all the data from pyspark.ml.classification import RandomForestClassifier rfc = RandomForestClassifier( featuresCol="Features_vec", labelCol="ArrDelayBucket", predictionCol="Prediction", maxBins=4896, ) model = rfc.fit(training_data) # Save the new model over the old one model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format( base_path ) model.write().overwrite().save(model_output_path) # Evaluate model using test data predictions = model.transform(test_data) # Evaluate this split's results for each metric from pyspark.ml.evaluation import MulticlassClassificationEvaluator for metric_name in metric_names: evaluator = MulticlassClassificationEvaluator( labelCol="ArrDelayBucket", predictionCol="Prediction", metricName=metric_name ) score = evaluator.evaluate(predictions) scores[metric_name].append(score) print("{} = {}".format(metric_name, score)) # # Collect feature importances # feature_names = vector_assembler.getInputCols() feature_importance_list = model.featureImportances for feature_name, feature_importance in zip(feature_names, feature_importance_list): feature_importances[feature_name].append(feature_importance) # # Evaluate average and STD of each metric and print a table # import numpy as np score_averages = defaultdict(float) # Compute the table data average_stds = [] # ha for metric_name in metric_names: metric_scores = scores[metric_name] average_accuracy = sum(metric_scores) / len(metric_scores) score_averages[metric_name] = average_accuracy std_accuracy = np.std(metric_scores) average_stds.append((metric_name, average_accuracy, std_accuracy)) # Print the table print("\nExperiment Log") print("--------------") 
print(tabulate(average_stds, headers=["Metric", "Average", "STD"])) # # Persist the score to a sccore log that exists between runs # import pickle # Load the score log or initialize an empty one try: score_log_filename = "{}/models/score_log.pickle".format(base_path) score_log = pickle.load(open(score_log_filename, "rb")) if not isinstance(score_log, list): score_log = [] except IOError: score_log = [] # Compute the existing score log entry score_log_entry = { metric_name: score_averages[metric_name] for metric_name in metric_names } # Compute and display the change in score for each metric try: last_log = score_log[-1] except (IndexError, TypeError, AttributeError): last_log = score_log_entry experiment_report = [] for metric_name in metric_names: run_delta = score_log_entry[metric_name] - last_log[metric_name] experiment_report.append((metric_name, run_delta)) print("\nExperiment Report") print("-----------------") print(tabulate(experiment_report, headers=["Metric", "Score"])) # Append the existing average scores to the log score_log.append(score_log_entry) # Persist the log for next run pickle.dump(score_log, open(score_log_filename, "wb")) # # Analyze and report feature importance changes # # Compute averages for each feature feature_importance_entry = defaultdict(float) for feature_name, value_list in feature_importances.items(): average_importance = sum(value_list) / len(value_list) feature_importance_entry[feature_name] = average_importance # Sort the feature importances in descending order and print import operator sorted_feature_importances = sorted( feature_importance_entry.items(), key=operator.itemgetter(1), reverse=True ) print("\nFeature Importances") print("-------------------") print(tabulate(sorted_feature_importances, headers=['Name', 'Importance'])) # # Compare this run's feature importances with the previous run's # # Load the feature importance log or initialize an empty one try: feature_log_filename = "{}/models/feature_log.pickle".format(base_path) feature_log = pickle.load(open(feature_log_filename, "rb")) if not isinstance(feature_log, list): feature_log = [] except IOError: feature_log = [] # Compute and display the change in score for each feature try: last_feature_log = feature_log[-1] except (IndexError, TypeError, AttributeError): last_feature_log = defaultdict(float) for feature_name, importance in feature_importance_entry.items(): last_feature_log[feature_name] = importance # Compute the deltas feature_deltas = {} for feature_name in feature_importances.keys(): run_delta = feature_importance_entry[feature_name] - last_feature_log[feature_name] feature_deltas[feature_name] = run_delta # Sort feature deltas, biggest change first import operator sorted_feature_deltas = sorted( feature_deltas.items(), key=operator.itemgetter(1), reverse=True ) # Display sorted feature deltas print("\nFeature Importance Delta Report") print("-------------------------------") print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"])) # Append the existing average deltas to the log feature_log.append(feature_importance_entry) # Persist the log for next run pickle.dump(feature_log, open(feature_log_filename, "wb"))
def process_log_data(spark, input_data, output_data): """Process user log data creating the tables user, time and songplays Args: spark (SparkSession): The spark session object input_data (str): The input files path output_data (str): The output files path """ # read log data file LOGGER.info('read log data file') log_df = spark.read.json(input_data) # filter by actions for song plays LOGGER.info('filter by actions for song plays') log_df = log_df.where(F.col('page') == 'NextSong') # extract columns for users table LOGGER.info('extract columns for users table') user_table = log_df.select( ['userId', 'firstName', 'lastName', 'gender', 'level']) # write users table to parquet files LOGGER.info('write users table to parquet files') user_path = os.path.join(output_data, 'user') user_table.coalesce(1).write.mode('overwrite').parquet(user_path) # create datetime column from original timestamp column LOGGER.info('create datetime column from original timestamp column') get_timestamp = F.udf(lambda x: datetime.utcfromtimestamp(int(x) / 1000), TimestampType()) log_df = log_df.withColumn("start_time", get_timestamp("ts")) # extract columns to create time table LOGGER.info('extract columns to create time table') time_table = log_df.select( 'start_time', F.hour('start_time').alias('hour'), F.dayofmonth('start_time').alias('day'), F.weekofyear('start_time').alias('weekofyear'), F.month('start_time').alias('month'), F.year('start_time').alias('year'), F.dayofweek('start_time').alias('weekday')).drop_duplicates( ['start_time']) # write time table to parquet partitioned by year and month LOGGER.info('write time table to parquet partitioned by year and month') time_table.coalesce(1).write.mode('overwrite')\ .partitionBy('year', 'month')\ .parquet(os.path.join(output_data, 'time')) # read in song data to use for songplays table LOGGER.info('read in song data to use for songplays table') song_df = spark.read.parquet(os.path.join(output_data, 'song')) artist_df = spark.read.parquet(os.path.join(output_data, 'artist')) # join artist and song data LOGGER.info('join artist and song data') song_df = artist_df.select(['artist_name', 'artist_id'])\ .join(song_df, on='artist_id', how='inner') # extract columns from joined song and log datasets to create songplays LOGGER.info('extract columns from joined song and log datasets to create ' 'songplays') on_clause = \ (song_df.title == log_df.song) \ & (song_df.artist_name == log_df.artist) \ & (song_df.duration == log_df.length) songplays_table = log_df.join(song_df, on_clause, how='inner') # select columns and create year and month columns LOGGER.info('select columns and create year and month columns') songplays_table = songplays_table.select( 'start_time', F.col('userId').alias('user_id'), 'level', 'song_id', 'artist_id', F.col('itemInSession').alias('session_id'), 'location', F.col('userAgent').alias('user_agent'), F.month('start_time').alias('month'), F.year('start_time').alias('year')) # create songplay_id and drop duplicates by this column LOGGER.info('create songplay_id and drop duplicates by this column') key_columns = [ 'start_time', 'user_id', 'song_id', 'artist_id', 'session_id' ] songplays_table = songplays_table.withColumn( 'songplay_id', F.sha2(F.concat_ws("||", *key_columns), 256)).drop_duplicates(['songplay_id']) # write songplays table to parquet files partitioned by year and month LOGGER.info('write songplays table to parquet partitioned by year/month') songplays_table.coalesce(1).write.mode('overwrite')\ .partitionBy('year', 'month')\ 
.parquet(os.path.join(output_data, 'songplays'))
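# Toy illustration (not part of the pipeline above) of the deterministic
# surrogate key used for songplay_id: hashing the concatenated natural-key
# columns means the same event always maps to the same id across reruns.
# The local SparkSession and the row values below are made up for the demo.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

_spark = SparkSession.builder.master("local[1]").appName("sha2-demo").getOrCreate()
_demo = _spark.createDataFrame(
    [("2018-11-01 21:01:46", "8", "SOABC12", "ARXYZ99", "139")],
    ["start_time", "user_id", "song_id", "artist_id", "session_id"])
_key_columns = ["start_time", "user_id", "song_id", "artist_id", "session_id"]
_demo.withColumn("songplay_id", F.sha2(F.concat_ws("||", *_key_columns), 256)).show(truncate=False)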
t4 = t3.groupby('df2.hashtag', 'df1.hashtag').agg(f.count("df1.tweet_id").alias("count")) w = Window.partitionBy('df2.hashtag') t5 = t4.withColumn( 'max', f.max('count').over(w)).where(f.col('count') == f.col('max')).selectExpr( 'df2.hashtag as hashtag', 'df1.hashtag as other_combination_tag', 'count as other_tag_count') # writing data to db t5.write.jdbc(url=url, table="popular_tags_popular_combination", mode="overwrite", properties=properties) t5.show() # query 3 : find out the per hour frequency of popular #tag tweet for each location. w = Window.partitionBy("place", "hashtag", "date", "hour") per_hour_frequency = most_popular_tags.\ withColumn("date", f.to_date(f.col("created_at"))).\ withColumn("hour", f.hour(f.col("created_at"))).\ withColumn("tag_count", f.count('id').over(w)).\ select('place', 'date', 'hour', 'hashtag', 'tag_count').\ distinct().\ sort(f.asc('place'), f.asc('hashtag'), f.asc('date'), f.asc('hour'), f.desc('tag_count')) # storing data to db per_hour_frequency.write.jdbc(url=url, table="tags_frequency", mode="overwrite", properties=properties) per_hour_frequency.show()
def process_log_data(spark, input_data, output_data): ''' create user, time and songplays table ''' print("# extract log data") # get filepath to log data file log_data = input_data + "log-data/*/*/*.json" # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.filter(df.page == "NextSong") print("# process users") # extract columns for users table users_table = df.select("userId", "firstName", "lastName", "gender", "level").dropDuplicates() # write users table to parquet files users_table.write.mode("overwrite").parquet( os.path.join(output_data, "users")) print("# process time") # create timestamp column from original timestamp column get_timestamp = udf(lambda ts: datetime.fromtimestamp(int(ts) / 1000.0), TimestampType()) df = df.withColumn("timestamp", get_timestamp(df.ts)) # create datetime column from original timestamp column get_datetime = udf(lambda ts: datetime.fromtimestamp(int(ts) / 1000.0), DateType()) df = df.withColumn("datetime", get_datetime(df.ts)) # extract columns to create time table time_table = df.select("ts", "timestamp", "datetime", hour(df.timestamp).alias("hour"), dayofmonth(df.timestamp).alias("day"), weekofyear(df.datetime).alias("week"), month(df.datetime).alias("month"), year(df.datetime).alias("year"), date_format(df.timestamp, "E").alias("weekday")).dropDuplicates() # write time table to parquet files partitioned by year and month time_table.write.partitionBy("year", "month").mode("overwrite").parquet( os.path.join(output_data, "time")) print("# extract song data") # read in song data to use for songplays table song_df = spark.read.json(input_data + "song-data/A/A/A/TRAAAAK128F9318786.json") song_df.createOrReplaceTempView("song_data") print("# process songsplays") # extract columns from joined song and log datasets to create songplays # table df.createOrReplaceTempView("log_data") songplays_table = spark.sql(""" SELECT DISTINCT row_number() OVER (PARTITION BY sd.song_id ORDER BY ld.userId DESC) as songplay_id, ts as start_time, month(timestamp) as month, year(timestamp) as year, ld.userId as user_id, ld.level, sd.song_id, sd.artist_id, ld.sessionId as session_id, ld.location, ld.userAgent as user_agent FROM log_data ld JOIN song_data sd ON ld.artist = sd.artist_name and ld.song = sd.title and ld.length = sd.duration """) # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy("year", "month").mode("overwrite").parquet( os.path.join(output_data, "songplays")) print("ETL done.")
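# Hedged aside on the SQL above: row_number() is windowed by sd.song_id, so the
# generated songplay_id restarts at 1 for every song and is only unique within
# a song_id partition. A toy DataFrame-API illustration of that behaviour
# (made-up rows; assumes an active SparkSession named spark):
from pyspark.sql import functions as F
from pyspark.sql.window import Window

_demo = spark.createDataFrame(
    [("SONG_A", "10"), ("SONG_A", "7"), ("SONG_B", "42")], ["song_id", "userId"])
_w = Window.partitionBy("song_id").orderBy(F.desc("userId"))
_demo.withColumn("songplay_id", F.row_number().over(_w)).show()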
# MAGIC Using the DataFrame `not_found_df` you cached in the part (5a) and sorting by hour of the day in increasing order, create a DataFrame containing the number of requests that had a 404 return code for each hour of the day (midnight starts at 0). Cache the resulting DataFrame `hour_records_sorted_df` and print that as a list. # COMMAND ---------- # TODO: Replace <FILL IN> with appropriate code from pyspark.sql.functions import hour hour_records_sorted_df = not_found_df.<FILL IN> print 'Top hours for 404 requests:\n' hour_records_sorted_df.show(24) # COMMAND ---------- # TODO: Replace <FILL IN> with appropriate code from pyspark.sql.functions import hour hour_records_sorted_df = not_found_df.groupBy(hour("time").alias("hr")).count().orderBy("hr").cache() print 'Top hours for 404 requests:\n' hour_records_sorted_df.show(24) # COMMAND ---------- # TEST Hourly 404 response codes (5h) errs_by_hour = [(row[0], row[1]) for row in hour_records_sorted_df.collect()] expected = [ (0, 175), (1, 171), (2, 422), (3, 272),
def spark_process(sqlContext, sc, validate, path_to_file): ###################### # # HDFS to DataFrame # ###################### ## all fields: # ['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count', 'trip_distance', # 'pickup_longitude', 'pickup_latitude', 'rate_code', 'store_and_fwd_flag', 'dropoff_longitude', # 'dropoff_latitude', 'payment_type', 'fare_amount', 'surcharge', 'mta_tax', 'tip_amount', # 'tolls_amount', 'total_amount'] # columns to select feature_columns = [1,2,3,5,6,9,10] # read file and convert to DataFrame # dataframe = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load(path_to_file).cache() customSchema = StructType([ StructField("vendor_id", StringType(), True), StructField("pickup_datetime", TimestampType(), True), StructField("dropoff_datetime", TimestampType(), True), StructField("passenger_count", StringType(), True), StructField("trip_distance", StringType(), True), StructField("pickup_longitude", DoubleType(), True), StructField("pickup_latitude", DoubleType(), True), StructField("rate_code", StringType(), True), StructField("store_and_fwd_flag", StringType(), True), StructField("dropoff_longitude", DoubleType(), True), StructField("dropoff_latitude", DoubleType(), True), StructField("payment_type", StringType(), True), StructField("fare_amount", StringType(), True), StructField("surcharge", StringType(), True), StructField("mta_tax", StringType(), True), StructField("tip_amount", StringType(), True), StructField("tolls_amount", StringType(), True), StructField("total_amount", StringType(), True) ]) dataframe = sqlContext.read.format('com.databricks.spark.csv').options(header='true', schema = customSchema).load(path_to_file) # create dataframe with selected columns dataframe = dataframe.select(*(dataframe.columns[n] for n in feature_columns)) # this number does not include the header # number_of_trips = dataframe.count() sqlContext.clearCache() ###################### # # Preprocess data # ###################### # filter rows with null fields # if passenger count is missing assign it a value of 1 # filter invalid location: keep only areas near NYC dataframe = dataframe.na.drop(how='any',subset=['pickup_datetime','dropoff_datetime','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude']) \ .fillna(1,subset=["passenger_count"]) \ .filter(dataframe.pickup_latitude>40.0) \ .filter(dataframe.pickup_latitude<41.0) \ .filter(dataframe.pickup_longitude<-73.0) \ .filter(dataframe.pickup_longitude>-74.0) \ .filter(dataframe.dropoff_latitude>40.0) \ .filter(dataframe.dropoff_latitude<41.0) \ .filter(dataframe.dropoff_longitude<-73.0)\ .filter(dataframe.dropoff_longitude>-74.0) ###################### # # features engineering # ###################### # create new column based on time-delta (minutes) # convert pickup-datetime column to hour time_delta_udf = udf(time_delta_minutes,FloatType()) dataframe = dataframe.withColumn('time_delta', time_delta_udf(dataframe.pickup_datetime,dataframe.dropoff_datetime)) \ .withColumn('pick_up_hour', hour(dataframe.pickup_datetime)) dataframe = dataframe.select(dataframe.pick_up_hour, \ dataframe.passenger_count.cast("integer"), \ dataframe.pickup_longitude.cast("double"), \ dataframe.pickup_latitude.cast("double"), \ dataframe.dropoff_longitude.cast("double"),\ dataframe.dropoff_latitude.cast("double"), \ dataframe.time_delta.cast("double")) dataframe = dataframe.filter(dataframe.time_delta > 1.0).cache() # split dataframe into feature and label vector # 
create feature vectors and labels for model training feature_assembler = VectorAssembler(inputCols = ['pick_up_hour','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'],outputCol = 'features') transformed = feature_assembler.transform(dataframe) vector_dataframe = transformed.select(col("time_delta").alias("label"),col("features")).cache() ###################### # # train model # ###################### if validate: ################################ # # validate model on 60/40 split # ################################ # split training, test = vector_dataframe.randomSplit([0.6, 0.4], seed=0) decision_tree_reg = DecisionTreeRegressor(maxDepth=12,maxBins=25) model = decision_tree_reg.fit(training) train_pred = model.transform(training) test_pred = model.transform(test) evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2") r2_train = evaluator.evaluate(train_pred) evaluator_test = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2") r2_test = evaluator_test.evaluate(test_pred) output = test_pred.select("prediction", "label", "features") return output, r2_test, r2_train else: ################### # # train on all data # ################### decision_tree_reg = DecisionTreeRegressor(maxDepth=12,maxBins=25) model = decision_tree_reg.fit(vector_dataframe) predictions = model.transform(vector_dataframe) output = predictions.select("prediction", "label", "features") ########################### # # process to send to Kafka # ########################### schema = StructType([StructField("prediction_mins", FloatType(), True), StructField("pick_up_hour", IntegerType(), True), StructField("pickup_longitude", DoubleType(), True), StructField("pickup_latitude", DoubleType(), True), StructField("dropoff_longitude", DoubleType(), True), StructField("dropoff_latitude", DoubleType(), True)]) features_from_predictions = output.map(lambda row: (float(row.prediction),int(row.features[0]),float(row.features[1]),float(row.features[2]),float(row.features[3]),float(row.features[4]) ) ).collect() sqlContext.clearCache() dataframe_from_prediction_vector = sqlContext.createDataFrame(features_from_predictions,schema).cache() return dataframe_from_prediction_vector
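# time_delta_minutes, wrapped by time_delta_udf in spark_process above, is
# referenced but never defined in this snippet. The sketch below is a plausible
# reconstruction offered as an assumption, not the original helper: both
# arguments arrive as Python datetimes (the columns are TimestampType) and the
# trip duration is returned in minutes as a float, matching FloatType().
def time_delta_minutes(pickup_datetime, dropoff_datetime):
    # Guard against rows where either timestamp failed to parse
    if pickup_datetime is None or dropoff_datetime is None:
        return None
    return float((dropoff_datetime - pickup_datetime).total_seconds() / 60.0)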