import pytest

from operations import Operations


@pytest.fixture
def df(spark_context, hive_context):
    """
    Fixture for creating a test dataframe.

    Args:
        :param spark_context: SparkContext object from fixture.
        :param hive_context: HiveContext object from fixture.

    Returns:
        :return: DataFrame object.
    """
    lines = ['ace Beubiri 10 12744',
             'ace Bhutan 20 31284',
             'ace Bireu%c3%abn 30 20356',
             'ace Bireuen 40 20347',
             'ace Bishkek 50 14665',
             'ace John_Person%27s_first_100_days 60 14576',
             'ace Bolivia 70 32058',
             'ace Bosnia_H%c3%a8rz%c3%a8govina 80 38777']
    rdd = spark_context.parallelize(lines)
    ops = Operations()
    return ops.create_dataframe(rdd, hive_context)
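For reference, a minimal sketch of a test that might consume this fixture (hypothetical; the original test suite is not shown). The row count follows from the eight input lines, and the 'pagename' column name comes from the pipeline code below.

def test_create_dataframe(df):
    # Eight input rows should yield eight dataframe rows.
    assert df.count() == 8
    # The pipeline later cleans a 'pagename' column, so it should exist.
    assert 'pagename' in df.columns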
from pyspark import SparkContext
from pyspark.sql import HiveContext

from operations import Operations

sc = SparkContext(appName="wikistats")

# Configure S3 access before reading the data (credentials redacted).
sc._jsc.hadoopConfiguration().set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", "###")
sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", "#####")

lines = sc.textFile("s3n://my.wiki.bucket.com/wikidata")
sqlContext = HiveContext(sc)

ops = Operations()

# Create the dataframe from the lines RDD
df = ops.create_dataframe(lines, sqlContext)

# Clean the 'pagename' column of encoded characters
df = ops.clean_string_column(df, 'pagename')

# Add columns for hour, day, month, year from the file name
df = ops.append_date_columns(df)

# Group by timeframes
hour_df, day_df, month_df, year_df = ops.aggregate_times(df)

# Create tokens from the pagename
hour_df = ops.append_tokens(hour_df)

# Add term frequency and inverse document frequency
hour_df = ops.append_tf_idf(hour_df)

# Create ranking
hour_df, day_df, month_df, year_df = ops.append_ranks(hour_df, day_df, month_df, year_df)

# Get the top 200 for each timeframe
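The clean_string_column step decodes percent-encoded pagenames such as 'Bireu%c3%abn'. A minimal sketch of one way to implement it with a UDF, assuming Python 3 and a nullable string column; this is an illustration, not necessarily how Operations does it.

from urllib.parse import unquote  # assumption: Python 3 (Python 2 would use urllib.unquote)

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def clean_string_column(df, column):
    # Percent-decode values, e.g. 'Bireu%c3%abn' -> 'Bireuën'.
    decode = udf(lambda s: unquote(s) if s is not None else None, StringType())
    return df.withColumn(column, decode(df[column]))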
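The last step exists only as a comment in the source. One plausible way to finish it, assuming append_ranks left an integer 'rank' column on each dataframe (the column name is an assumption, not confirmed by the source):

# Keep only the 200 highest-ranked pages in each timeframe.
top_hour  = hour_df.filter(hour_df['rank'] <= 200)
top_day   = day_df.filter(day_df['rank'] <= 200)
top_month = month_df.filter(month_df['rank'] <= 200)
top_year  = year_df.filter(year_df['rank'] <= 200)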