def main(inputs):
    """Compute the mean review-helpfulness percentage per star rating.

    Writes the aggregate to 'helpfulness_vs_rating' as CSV and shows a bar
    chart of average helpfulness vs. star rating.
    """
    reviews_df = utilities.get_completereviews_dataframe(spark)
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA)
    #reviews_df.cache()

    # Require a product id, more than one total vote and at least one helpful
    # vote so the percentage is meaningful and division by zero is impossible.
    helpful_df = reviews_df.filter(reviews_df.product_id.isNotNull()).filter(
        reviews_df.total_votes > 1).filter(
            reviews_df.helpful_votes > 0).cache()
    helpful_df = helpful_df.withColumn(
        'helpfulness_percentage',
        (helpful_df.helpful_votes / helpful_df.total_votes) * 100)
    helpful_df = helpful_df.select('helpfulness_percentage', 'star_rating')
    helpful_df = helpful_df.groupBy('star_rating').agg(
        functions.mean(helpful_df.helpfulness_percentage).alias(
            'mean_helpfulness_percent')).select(
                'star_rating', 'mean_helpfulness_percent').cache()
    helpful_df.repartition(1).write.mode('overwrite').csv(
        'helpfulness_vs_rating')

    # Safe to collect: at most one row per star rating value.
    avg_helpfulness_dict = helpful_df.coalesce(1).rdd.collectAsMap()
    x_array = list(avg_helpfulness_dict.keys())
    y_array = list(avg_helpfulness_dict.values())
    plt.bar(x_array, y_array, color='g')
    plt.xlabel('Ratings')
    plt.ylabel('Average helpfulness')
    # Fixed chart-title typos (was 'Avgerage helpfuless vs rating').
    plt.title('Average helpfulness vs rating')
    plt.show()
def main(inputs):
    """Find influential reviewers: customers with many consistently helpful reviews."""
    reviews_df = utilities.get_completereviews_dataframe(spark)
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA, encoding='utf-8')

    # Keep reviews tied to a product with enough votes for a meaningful ratio
    # (also prevents division by zero below).
    votable = (reviews_df
               .filter(reviews_df.product_id.isNotNull())
               .filter(reviews_df.total_votes > 1)
               .filter(reviews_df.helpful_votes > 0))

    # Per-review helpfulness as a percentage of total votes.
    per_review = votable.withColumn(
        'helpfulness_percentage',
        (votable.helpful_votes / votable.total_votes) * 100
    ).select('customer_id', 'helpfulness_percentage').cache()

    # Per-customer review count and mean helpfulness.
    per_customer = per_review.groupBy('customer_id').agg(
        functions.count('*').alias('num_reviews'),
        functions.mean('helpfulness_percentage').alias(
            'mean_helpfulness_percentage'))

    # "Influential" = more than 150 reviews averaging above 90% helpfulness.
    influential = (per_customer
                   .filter(per_customer.num_reviews > 150)
                   .filter(per_customer.mean_helpfulness_percentage > 90))
    influential.sort('num_reviews', ascending=False) \
        .write.mode('overwrite').csv('influential_reviewers')
def main(inputs):
    """Plot the average star rating of each product category as a bar chart."""
    reviews_df = utilities.get_completereviews_dataframe(spark)
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA)

    rated = reviews_df.filter(reviews_df.product_id.isNotNull())
    per_category = rated.groupBy(rated.product_category).agg(
        functions.avg('star_rating').alias('average')).cache()
    per_category.write.mode('overwrite').csv('category_wise_average')

    # One row per category, so collecting the whole result is cheap.
    by_avg_desc = per_category.sort(
        per_category.average, ascending=False).rdd.collectAsMap()
    categories = list(by_avg_desc.keys())
    averages = list(by_avg_desc.values())
    positions = range(len(averages))

    plt.barh(positions, averages)
    plt.yticks(positions, categories)
    plt.xlabel('Star Rating of an average product')
    plt.ylabel('Categories')
    plt.title('Categories wise average ratings')
    plt.show()
def main(inputs):
    """Compute and plot the average review length (in words) per category."""
    reviews_df = utilities.get_completereviews_dataframe(spark).dropna()
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA).dropna()
    reviews_df.cache()

    with_length = reviews_df.filter(
        reviews_df.product_id.isNotNull()).withColumn(
            'length_words', word_count(reviews_df.review_body))

    per_category = with_length.groupBy('product_category').agg(
        functions.avg('length_words').alias('avg_length_words')).select(
            'product_category', 'avg_length_words').cache()
    per_category.repartition(1).write.mode('overwrite').csv(
        'avg_length_per_category')

    # Small result (one row per category); sort by category name, reversed,
    # before plotting.
    lengths = per_category.rdd.collectAsMap()
    ordered = collections.OrderedDict(sorted(lengths.items(), reverse=True))
    categories = list(ordered.keys())
    avg_lengths = list(ordered.values())
    positions = range(len(avg_lengths))

    plt.barh(positions, avg_lengths, color='g')
    plt.yticks(positions, categories)
    plt.xlabel('Average Review Length in words')
    plt.ylabel('Category')
    plt.title('Average Length across categories')
    plt.show()
def main(inputs):
    """Print pairwise Pearson correlations between review helpfulness,
    star rating, and review length (in words)."""
    reviews_df = utilities.get_completereviews_dataframe(spark)
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA, encoding='utf-8')

    # Require a product id, >1 total votes and >0 helpful votes so the
    # helpfulness ratio is defined (no division by zero).
    helpful_df = reviews_df.filter(reviews_df.product_id.isNotNull()).filter(
        reviews_df.total_votes > 1).filter(
            reviews_df.helpful_votes > 0).cache()
    helpful_df = helpful_df.withColumn(
        'helpfulness_percentage',
        (helpful_df.helpful_votes / helpful_df.total_votes) * 100)
    helpful_df = helpful_df.withColumn(
        'length_words',
        word_count(reviews_df.review_body)).select('helpfulness_percentage',
                                                   'star_rating',
                                                   'length_words')

    helpful_star_rating_corr = helpful_df.corr('helpfulness_percentage',
                                               'star_rating')
    length_rating_corr = helpful_df.corr('length_words', 'star_rating')
    # Renamed from 'star_length_corr': this correlates helpfulness with
    # review length, not star rating — the old name was misleading.
    helpfulness_length_corr = helpful_df.corr('helpfulness_percentage',
                                              'length_words')

    print(helpful_star_rating_corr)
    print(length_rating_corr)
    print(helpfulness_length_corr)
def main(inputs):
    """Summarize word counts of the most helpful reviews (median, mean, stddev)
    and write overall distinct customer/product counts."""
    reviews_df = utilities.get_completereviews_dataframe(spark).dropna()
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA)
    reviews_df.cache()

    # Reviews with a product id and enough votes for a meaningful ratio.
    voted = (reviews_df
             .filter(reviews_df.product_id.isNotNull())
             .filter(reviews_df.total_votes > 1)
             .filter(reviews_df.helpful_votes > 0)
             .cache())
    voted = voted.withColumn(
        'helpfulness_percentage',
        (voted.helpful_votes / voted.total_votes) * 100)

    # "Most helpful" = over 90% of votes were helpful.
    most_helpful = voted.filter(
        voted.helpfulness_percentage > 90).select('review_body')
    most_helpful = most_helpful.withColumn(
        'word_count', word_count(most_helpful.review_body)).cache()

    # Approximate median word count of the most helpful reviews.
    median = most_helpful.approxQuantile('word_count', [0.5], 0.25)
    sc.parallelize(median).repartition(1).saveAsTextFile(
        'median_words_in_helpfulreviews.csv')

    # Mean and standard deviation of the same word counts.
    stats_df = most_helpful.agg(
        functions.avg('word_count').alias('average_wordcount'),
        functions.stddev('word_count').alias('std_wordcount'))
    stats_df.repartition(1).write.mode('overwrite').csv(
        'average_stddev_wordcount_mosthelpful')

    # Distinct customers and products across the whole dataset.
    count = reviews_df.filter(reviews_df.customer_id.isNotNull()).select(
        'customer_id').agg(functions.countDistinct('customer_id'))
    count.repartition(1).write.mode('overwrite').csv(
        'total_customersin_dataset')
    product_count = reviews_df.filter(
        reviews_df.product_id.isNotNull()).select('product_id').agg(
            functions.countDistinct('product_id'))
    product_count.repartition(1).write.mode('overwrite').csv(
        'total_productsin_dataset')
def main(inputs):
    """Rank product categories by mean review helpfulness and plot them."""
    reviews_df = utilities.get_completereviews_dataframe(spark)
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA)

    # Removing rows which have total votes or helpful votes 0 to avoid
    # divide by zero error.
    voted = (reviews_df
             .filter(reviews_df.product_id.isNotNull())
             .filter(reviews_df.total_votes > 1)
             .filter(reviews_df.helpful_votes > 0)
             .cache())
    voted = voted.withColumn(
        'helpfulness_percentage',
        (voted.helpful_votes / voted.total_votes) * 100)
    voted = voted.select('helpfulness_percentage', 'product_category')

    per_category = voted.groupBy('product_category').agg(
        functions.mean(voted.helpfulness_percentage).alias(
            'mean_helpfulness_percent'))
    per_category.write.mode('overwrite').csv('category_mean_helpfulness')

    # Collecting is safe since we know the result is a very small dataset:
    # one row per product category.
    by_mean_desc = per_category.sort(
        per_category.mean_helpfulness_percent,
        ascending=False).rdd.collectAsMap()
    categories = list(by_mean_desc.keys())
    mean_percentages = list(by_mean_desc.values())
    positions = range(len(mean_percentages))

    plt.barh(positions, mean_percentages)
    plt.yticks(positions, categories)
    plt.xlabel('Helpfulness percentage of an average review')
    plt.ylabel('Categories')
    plt.title('Categories with most helpful reviews')
    plt.show()
def main(inputs):
    """Split reviews into sentiment buckets by star rating and save the
    adjectives found in each bucket."""
    reviews_df = utilities.get_completereviews_dataframe(spark)
    # nltk.data.path.append('/home/youruserid/nltk_data')
    # reviews_df = spark.read.parquet(inputs)
    reviews_df = reviews_df.filter(reviews_df.product_id.isNotNull())

    # Bucket by star rating: >3 positive, <3 negative, ==3 neutral.
    positive_reviews_df = reviews_df.filter(reviews_df.star_rating > 3)
    negative_reviews_df = reviews_df.filter(reviews_df.star_rating < 3)
    neutral_reviews_df = reviews_df.filter(reviews_df.star_rating == 3)

    save_adjectives(positive_reviews_df, 'positive_adjectives')
    save_adjectives(negative_reviews_df, 'negative_adjectives')
    # Fixed misspelled output name (was 'neutral_adjectivees'), making it
    # consistent with the other two outputs.
    save_adjectives(neutral_reviews_df, 'neutral_adjectives')
def main(inputs):
    """Plot and save the distribution of star ratings across all reviews."""
    reviews_df = utilities.get_completereviews_dataframe(spark)
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA)
    reviews_df = reviews_df.filter(reviews_df.product_id.isNotNull()).select(
        reviews_df.star_rating).cache()

    # At most one row per star rating value, so collecting is cheap.
    ratings_dict = reviews_df.groupBy('star_rating').agg(
        functions.count('star_rating').alias('count')).rdd.collectAsMap()
    x_array = list(ratings_dict.keys())
    y_array = list(ratings_dict.values())

    plt.bar(x_array, y_array)
    plt.xlabel('Ratings of products on Amazon')
    plt.ylabel('count')
    plt.title('Count of each rating')
    # BUG FIX: savefig must run before show(). show() blocks until the window
    # is closed and the figure is torn down, so saving afterwards wrote an
    # empty image.
    plt.savefig('../../figures/ratings_distribution.png')
    plt.show()
def main(inputs):
    """Compute the mean, approximate median, and mode of star ratings and
    write each to its own output."""
    reviews_df = utilities.get_completereviews_dataframe(spark)
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA)

    # Average rating over all reviews that have a product id.
    reviews_df = reviews_df.filter(reviews_df.product_id.isNotNull()).select(
        reviews_df.star_rating).cache()
    mean = reviews_df.agg(functions.avg(reviews_df.star_rating).alias('mean'))
    mean.write.mode('overwrite').csv('average')

    # Approximate median (relative error up to 0.25).
    median = reviews_df.approxQuantile('star_rating', [0.5], 0.25)
    sc.parallelize(median).saveAsTextFile('median.csv')
    print("The median of the dataset is : " + str(median))

    # Mode: the rating with the highest count. The grouped result is tiny
    # (one row per rating), so collecting it is safe.
    mode_dict = reviews_df.groupBy('star_rating').agg(
        functions.count('star_rating').alias('count')).rdd.collectAsMap()
    # Simplified from `mode = [mode]; sc.parallelize(list(mode))` — the extra
    # list() copy was redundant.
    mode = [keyWithMaxValue(mode_dict)]
    sc.parallelize(mode).saveAsTextFile('mode.csv')
    print("The mode of the dataset: " + str(mode))
def main(inputs):
    """Count distinct products per category and plot the counts."""
    reviews_df = utilities.get_completereviews_dataframe(spark)
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA)
    reviews_df.cache()

    # NOTE(review): the filter checks customer_id but the aggregate counts
    # distinct product_id — the filter may have been meant for product_id;
    # confirm intent before changing.
    per_category = reviews_df.filter(
        reviews_df.customer_id.isNotNull()).groupBy('product_category').agg(
            functions.countDistinct('product_id'))
    per_category.write.mode('overwrite').csv('product_count_categorywise')

    # Small result (one row per category); sort by category name, reversed,
    # before plotting.
    counts = per_category.rdd.collectAsMap()
    ordered = collections.OrderedDict(sorted(counts.items(), reverse=True))
    categories = list(ordered.keys())
    product_counts = list(ordered.values())
    positions = range(len(product_counts))

    plt.barh(positions, product_counts, color='g')
    plt.yticks(positions, categories)
    plt.xlabel('Number of distinct products')
    plt.ylabel('Category')
    plt.title('Product count across categories')
    plt.show()
def main(inputs):
    """Compute and plot the average review length (in words) per star rating."""
    reviews_df = utilities.get_completereviews_dataframe(spark).dropna()
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA).dropna()
    reviews_df.cache()

    reviews_df = reviews_df.filter(
        reviews_df.product_id.isNotNull()).withColumn(
            'length_words', word_count(reviews_df.review_body))

    avg_length_df = reviews_df.groupBy('star_rating').agg(
        functions.avg('length_words').alias('avg_length_words')).select(
            'star_rating', 'avg_length_words').cache()
    avg_length_df.write.mode('overwrite').csv('avg_length_per_rating')

    # One row per star rating — trivially small, safe to collect.
    avg_length_dict = avg_length_df.rdd.collectAsMap()
    x_array = list(avg_length_dict.keys())
    y_array = list(avg_length_dict.values())

    plt.bar(x_array, y_array, color='g')
    plt.xlabel('Ratings of products on Amazon')
    plt.ylabel('Avg length in words')
    # Fixed chart-title typo (was 'Avgerage length vs Ratings').
    plt.title('Average length vs Ratings')
    plt.show()