コード例 #1
0
ファイル: wiki_stats.py プロジェクト: ari99/wiki_stats
from operations import Operations

ops = Operations()

# Stage 1: build the DataFrame from the `lines` RDD, strip encoded characters
# out of the 'pagename' column, then append the date columns (hour, day,
# month, year -- derived from the file name, per the original notes).
df = ops.append_date_columns(
    ops.clean_string_column(
        ops.create_dataframe(lines, sqlContext),
        "pagename",
    )
)

# Stage 2: aggregate into one DataFrame per timeframe.
hour_df, day_df, month_df, year_df = ops.aggregate_times(df)

# Stage 3: only the hourly frame is tokenized (from the pagename) and then
# given term-frequency / inverse-document-frequency columns.
hour_df = ops.append_tf_idf(ops.append_tokens(hour_df))

# Stage 4: attach a rank column to every timeframe.
hour_df, day_df, month_df, year_df = ops.append_ranks(
    hour_df, day_df, month_df, year_df
)

# Stage 5: keep the top 200 rows of each timeframe (rank strictly below 201).
top_hourly, top_daily, top_monthly, top_yearly = (
    hour_df.filter(hour_df["hour_rank"] < 201),
    day_df.filter(day_df["day_rank"] < 201),
    month_df.filter(month_df["month_rank"] < 201),
    year_df.filter(year_df["year_rank"] < 201),
)

# Stage 6: export each result set as a plot CSV (written to S3 according to
# the original notes -- the destination is decided inside make_plot_csv).
for _top_df, _label in (
    (top_hourly, "hourly"),
    (top_daily, "daily"),
    (top_monthly, "monthly"),
    (top_yearly, "yearly"),
):
    ops.make_plot_csv(_top_df, _label)