# Example #1
# 0
def test_clean_string_column(df):
    """
    Tests operations.clean_string_column .
    Args:
        :param df: DataFrame object from fixture.
    """
    operations = Operations()
    cleaned = operations.clean_string_column(df, 'pagename')
    # Grab the single row identified by its pageview count and check the
    # cleaned page name.
    row = cleaned.where(cleaned['pageviews'] == 60).select("pagename").first()
    assert row['pagename'] == "john person s first 100 days"
# Example #2
# 0
def test_append_tokens(df):
    """
    Tests operations.append_tokens.
    Args:
        :param df: DataFrame object from fixture.
    """
    operations = Operations()
    cleaned = operations.clean_string_column(df, 'pagename')
    tokenized = operations.append_tokens(cleaned)
    # The stop word 'first' is removed.
    # See for a list of stop words: http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words
    row = tokenized.where(tokenized['pageviews'] == 60).select("tokens").first()
    assert row['tokens'] == ['john', 'person', '100', 'day']
# Example #3
# 0
# Configure Hadoop's s3n filesystem on an existing SparkContext.
# NOTE(review): `sc` is assumed to be a SparkContext created earlier
# (e.g. in a notebook or an earlier part of this file) — confirm.
sc._jsc.hadoopConfiguration().set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
# HACK: AWS credentials are hard-coded placeholders here — replace with
# environment variables or an IAM role before running against real S3.
sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", "###")
sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", "#####")


# Hive-enabled SQL context, passed to Operations.create_dataframe below.
sqlContext = HiveContext(sc)


from operations import Operations

ops = Operations()
#Create the dataframe from the lines RDD
# NOTE(review): `lines` is assumed to be an RDD built earlier in the
# file — not visible in this chunk; verify against the full script.
df = ops.create_dataframe(lines, sqlContext)
#Clean the 'pagename' column of encoded characters
df = ops.clean_string_column(df, 'pagename')
#Add columns for hour, day, month, year from the file name
df = ops.append_date_columns(df)

#Group by timeframes
hour_df, day_df, month_df, year_df = ops.aggregate_times(df)
#Create tokens from the pagename
hour_df = ops.append_tokens(hour_df)
#Add term frequency and inverse document frequency
hour_df = ops.append_tf_idf(hour_df)
#Create ranking
hour_df, day_df, month_df, year_df = ops.append_ranks(hour_df, day_df, month_df, year_df)

#Get the top 200 for each timeframe
# (presumably ranks start at 1, so `< 201` keeps ranks 1..200 — confirm
# against Operations.append_ranks)
top_hourly = hour_df.filter(hour_df['hour_rank']<201)
top_daily =  day_df.filter(day_df['day_rank']<201)