yield sorted(combination), 1 def reducer_jaccard(self, user_pair, value): pair_list = [p for p in user_pair] jaccard = sum(value) * 1.0 / ( (pair_list[0][1] + pair_list[1][1]) - sum(value)) yield [pair_list[0][0], pair_list[1][0]], jaccard def reducer_similar_pairs(self, key, values): similarity = sum(values) if similarity >= 0.5: num = len(similar_user_pairs) + 1 similar_user_pairs[num] = key yield "Pair:", key def steps(self): return [ MRStep(mapper=self.mapper_user_ids), MRStep(reducer=self.reducer_reviews_per_user), MRStep(reducer=self.reducer_pairs), MRStep(reducer=self.reducer_jaccard), MRStep(reducer=self.reducer_similar_pairs) ] if __name__ == '__main__': start = time.time() SimilarUsers.run() output_to_csv.make_csv('similar_users', similar_user_pairs) end = time.time() print("Time: " + str(end - start) + "sec")
yield associated_text_list[0], 1 def reducer_sum_uniques_in_text(self, associated_text, uniques): yield "Unique", [associated_text, sum(uniques)] def reducer_max_words_used_once(self, uniques, all_info): text = "" biggest_sum = 50 for info in all_info: if info[1] > biggest_sum: most_unique_comments[info[0]] = info[1] yield info[1], info[0] # if info[1] > biggest_sum: # biggest_sum = info[1] # text = info[0] # yield biggest_sum, text def steps(self): return [MRStep(mapper=self.mapper_text_by_word), MRStep(reducer=self.reducer_uniques_in_text), MRStep(reducer=self.reducer_sum_uniques_in_text), MRStep(reducer=self.reducer_max_words_used_once)] if __name__ == '__main__': start = time.time() UniqueReview.run() output_to_csv.make_csv('unique_comments', most_unique_comments) end = time.time() print("Time: " + str(end - start) + "sec")
def reducer_similarity(self, user_pair, rating_pairs): total_dividend = 0 total_divider_a = 0 total_divider_b = 0 for pair in rating_pairs: rating_a = pair[0] rating_b = pair[1] total_dividend += (rating_a*rating_b) total_divider_a += (rating_a*rating_a) total_divider_b += (rating_b*rating_b) similarity = total_dividend * 1.0/(math.sqrt(total_divider_a)*math.sqrt(total_divider_b)) if similarity > 0.8: num = len(similar_users) + 1 similar_users[num] = user_pair yield "Pair", user_pair def steps(self): return [MRStep(mapper=self.mapper_user_data), MRStep(reducer=self.reducer_user_pairs), MRStep(reducer=self.reducer_pair_ratings), MRStep(reducer=self.reducer_similarity)] if __name__ == '__main__': start = time.time() SimilarUsersRatings.run() output_to_csv.make_csv('similar_users_ratings', similar_users) end = time.time() print("Time: " + str(end - start) + "sec")
most_popular_user = None max_popularity = 0 for user, vote_list in user_dict.items(): review_count = len(vote_list) review_popularity = sum(vote_list) # assumption of count being more important, with popularity average adding onto it # a single extremely useful or popular review is similar to many irrelevant reviews popularity = review_count + (review_popularity / review_count) if popularity > max_popularity: max_popularity = popularity most_popular_user = user categorey_reviews[category] = most_popular_user yield category, most_popular_user def steps(self): return [ MRStep(mapper=self.mapper_user_category), MRStep(reducer=self.reducer_join_business_review), MRStep(reducer=self.reducer_categorize_user_votes), MRStep(reducer=self.reducer_most_popular_user) ] if __name__ == '__main__': start = time.time() PopularUsersPerCategory.run() output_to_csv.make_csv('popular_users_per_category', categorey_reviews) end = time.time() print("Time: " + str(end - start) + "sec")
for value in values: if value[0] == 'A': reviews.append(value) if value[0] == 'B': for review in reviews: full_review = review[1:] + value[1:] yield business_id, full_review def reducer_categorize_stars(self, business_id, reviews): for review in reviews: review_id = review[0] stars = review[1] categories = review[2] for category in categories: categorey_stars[review_id] = [category, stars] yield review_id, [category, stars] def steps(self): return [MRStep(mapper=self.mapper_stars_category), MRStep(reducer=self.reducer_join_business_review), MRStep(reducer=self.reducer_categorize_stars)] if __name__ == '__main__': start = time.time() StarsPerCategory.run() output_to_csv.make_csv('stars_per_category_30000', categorey_stars) end = time.time() print("Time: " + str(end - start) + "sec")