def test_clean_string_column(df):
    """Verify that Operations.clean_string_column normalizes the 'pagename' column.

    The row with pageviews == 60 is expected to come back with its page name
    cleaned of encoded characters.

    Args:
        df: DataFrame fixture provided by conftest.
    """
    operations = Operations()
    cleaned = operations.clean_string_column(df, 'pagename')
    row = cleaned.where(cleaned['pageviews'] == 60).select("pagename").first()
    assert row['pagename'] == "john person s first 100 days"
def test_append_tokens(df):
    """Verify that Operations.append_tokens tokenizes a cleaned 'pagename'.

    The stop word 'first' is dropped during tokenization; for the stop-word
    list see: http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words

    Args:
        df: DataFrame fixture provided by conftest.
    """
    operations = Operations()
    cleaned = operations.clean_string_column(df, 'pagename')
    tokenized = operations.append_tokens(cleaned)
    row = tokenized.where(tokenized['pageviews'] == 60).select("tokens").first()
    assert row['tokens'] == ['john', 'person', '100', 'day']
# Configure Hadoop's s3n filesystem so Spark can read input from S3.
# NOTE(review): credentials are hard-coded placeholders ("###"/"#####") —
# these should come from configuration or the environment, never source
# control. TODO: confirm and replace before deployment.
sc._jsc.hadoopConfiguration().set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", "###")
sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", "#####")

# Hive-backed SQL context for DataFrame operations.
# Assumes `sc` is a live SparkContext created earlier in the file — not
# visible in this chunk.
sqlContext = HiveContext(sc)

from operations import Operations

ops = Operations()

# Create the dataframe from the `lines` RDD (defined earlier in the file).
df = ops.create_dataframe(lines, sqlContext)
# Clean the 'pagename' column of encoded characters.
df = ops.clean_string_column(df, 'pagename')
# Add columns for hour, day, month, year derived from the file name.
df = ops.append_date_columns(df)
# Group pageviews by each timeframe granularity.
hour_df, day_df, month_df, year_df = ops.aggregate_times(df)
# Create tokens from the pagename (hourly granularity only).
hour_df = ops.append_tokens(hour_df)
# Add term frequency and inverse document frequency columns.
hour_df = ops.append_tf_idf(hour_df)
# Create a per-timeframe ranking column on each DataFrame.
hour_df, day_df, month_df, year_df = ops.append_ranks(hour_df, day_df, month_df, year_df)
# Keep the top 200 per timeframe (rank < 201 retains ranks 1..200).
top_hourly = hour_df.filter(hour_df['hour_rank']<201)
top_daily = day_df.filter(day_df['day_rank']<201)