Example #1
0
            yield sorted(combination), 1

    def reducer_jaccard(self, user_pair, value):
        """Yield the Jaccard similarity for a pair of users.

        Args:
            user_pair: iterable whose first two items are indexable entries;
                item ``[0]`` of each entry is emitted as the identifier and
                item ``[1]`` is used as that user's total count
                (presumably ``(user_id, review_count)`` -- verify against
                the step that produces this key).
            value: iterable of numbers whose sum is the size of the
                intersection.  It may be a one-shot iterator (mrjob hands
                reducers a generator), so it is consumed exactly once.

        Yields:
            ``([id_a, id_b], jaccard)`` where
            ``jaccard = intersection / (count_a + count_b - intersection)``.
        """
        pair_list = list(user_pair)
        # Sum once: a second sum() over an exhausted generator returns 0,
        # which would silently turn the union into count_a + count_b.
        intersection = sum(value)
        union = (pair_list[0][1] + pair_list[1][1]) - intersection
        # "* 1.0" keeps true division even under Python 2 integer semantics.
        jaccard = intersection * 1.0 / union
        yield [pair_list[0][0], pair_list[1][0]], jaccard

    def reducer_similar_pairs(self, key, values):
        """Emit ``key`` when its summed values reach the 0.5 threshold.

        A qualifying key is also stored in the ``similar_user_pairs``
        mapping (defined outside this method) under the next 1-based
        index, i.e. ``len(similar_user_pairs) + 1``.

        Yields:
            ``("Pair:", key)`` for a qualifying key; nothing otherwise.
        """
        total = sum(values)
        if not total >= 0.5:
            return
        next_index = len(similar_user_pairs) + 1
        similar_user_pairs[next_index] = key
        yield "Pair:", key

    def steps(self):
        """Return the job pipeline: one mapper step, then four reducer-only steps."""
        reducer_stages = (
            self.reducer_reviews_per_user,
            self.reducer_pairs,
            self.reducer_jaccard,
            self.reducer_similar_pairs,
        )
        pipeline = [MRStep(mapper=self.mapper_user_ids)]
        for stage in reducer_stages:
            pipeline.append(MRStep(reducer=stage))
        return pipeline


if __name__ == '__main__':
    # Time the whole run: the job itself plus the CSV export.
    started_at = time.time()
    SimilarUsers.run()
    output_to_csv.make_csv('similar_users', similar_user_pairs)
    elapsed = time.time() - started_at
    print("Time: " + str(elapsed) + "sec")
Example #2
0
            yield associated_text_list[0], 1

    def reducer_sum_uniques_in_text(self, associated_text, uniques):
        """Total the ``uniques`` counts for one text.

        Yields:
            ``("Unique", [associated_text, total])``.  Every record carries
            the same constant key ``"Unique"``.
        """
        total = sum(uniques)
        summary = [associated_text, total]
        yield "Unique", summary

    def reducer_max_words_used_once(self, uniques, all_info):
        """Emit every entry whose count is strictly greater than 50.

        Despite the method name, this does not pick a single maximum: each
        ``[text, count]`` entry in ``all_info`` with ``count > 50`` is
        reported.

        Args:
            uniques: the incoming key; not used here.
            all_info: iterable of indexable entries where ``[0]`` is the
                text and ``[1]`` is its count.

        Yields:
            ``(count, text)`` for each qualifying entry.  The same entry is
            also stored as ``most_unique_comments[text] = count`` in the
            mapping defined outside this method.
        """
        threshold = 50
        for entry in all_info:
            count = entry[1]
            if count > threshold:
                label = entry[0]
                most_unique_comments[label] = count
                yield count, label

    def steps(self):
        """Return the job pipeline: one mapper step, then three reducer-only steps."""
        mapper_stage = MRStep(mapper=self.mapper_text_by_word)
        reducer_stages = [
            MRStep(reducer=stage)
            for stage in (
                self.reducer_uniques_in_text,
                self.reducer_sum_uniques_in_text,
                self.reducer_max_words_used_once,
            )
        ]
        return [mapper_stage] + reducer_stages


if __name__ == '__main__':
    # Wall-clock timing covers both the job and the CSV export.
    began = time.time()
    UniqueReview.run()
    output_to_csv.make_csv('unique_comments', most_unique_comments)
    duration = time.time() - began
    print("Time: " + str(duration) + "sec")
    def reducer_similarity(self, user_pair, rating_pairs):
        """Emit ``user_pair`` when the cosine similarity of their ratings exceeds 0.8.

        Args:
            user_pair: the key identifying the two users; passed through
                unchanged.
            rating_pairs: iterable of indexable entries where ``[0]`` is the
                first user's rating and ``[1]`` the second user's rating.

        Yields:
            ``("Pair", user_pair)`` when the similarity is strictly greater
            than 0.8.  A qualifying pair is also stored in the
            ``similar_users`` mapping (defined outside this method) under
            the next 1-based index.  Nothing is yielded when the similarity
            is undefined, i.e. the input is empty or either rating vector
            is all zeros.
        """
        dot_product = 0
        squares_a = 0
        squares_b = 0
        for pair in rating_pairs:
            rating_a = pair[0]
            rating_b = pair[1]
            dot_product += rating_a * rating_b
            squares_a += rating_a * rating_a
            squares_b += rating_b * rating_b

        norm_product = math.sqrt(squares_a) * math.sqrt(squares_b)
        if norm_product == 0:
            # Cosine similarity is undefined for a zero vector; skip the
            # pair rather than raising ZeroDivisionError.
            return

        # "* 1.0" keeps true division even under Python 2 integer semantics.
        similarity = dot_product * 1.0 / norm_product
        if similarity > 0.8:
            num = len(similar_users) + 1
            similar_users[num] = user_pair
            yield "Pair", user_pair

    def steps(self):
        """Return the job pipeline: one mapper step, then three reducer-only steps."""
        pipeline = [MRStep(mapper=self.mapper_user_data)]
        pipeline.append(MRStep(reducer=self.reducer_user_pairs))
        pipeline.append(MRStep(reducer=self.reducer_pair_ratings))
        pipeline.append(MRStep(reducer=self.reducer_similarity))
        return pipeline


if __name__ == '__main__':
    # Measure the job plus the CSV export as one elapsed interval.
    t0 = time.time()
    SimilarUsersRatings.run()
    output_to_csv.make_csv('similar_users_ratings', similar_users)
    t1 = time.time()
    seconds = t1 - t0
    print("Time: " + str(seconds) + "sec")
        most_popular_user = None
        max_popularity = 0
        for user, vote_list in user_dict.items():
            review_count = len(vote_list)
            review_popularity = sum(vote_list)
            # assumption of count being more important, with popularity average adding onto it
            # a single extremely useful or popular review is similar to many irrelevant reviews
            popularity = review_count + (review_popularity / review_count)
            if popularity > max_popularity:
                max_popularity = popularity
                most_popular_user = user

        categorey_reviews[category] = most_popular_user

        yield category, most_popular_user

    def steps(self):
        """Return the job pipeline: one mapper step, then three reducer-only steps."""
        first_stage = MRStep(mapper=self.mapper_user_category)
        later_reducers = (
            self.reducer_join_business_review,
            self.reducer_categorize_user_votes,
            self.reducer_most_popular_user,
        )
        return [first_stage] + [MRStep(reducer=fn) for fn in later_reducers]


if __name__ == '__main__':
    # Report how long the job and the CSV export took together.
    clock_start = time.time()
    PopularUsersPerCategory.run()
    output_to_csv.make_csv('popular_users_per_category', categorey_reviews)
    clock_stop = time.time()
    print("Time: " + str(clock_stop - clock_start) + "sec")
Example #5
0
        for value in values:
            if value[0] == 'A':
                reviews.append(value)
            if value[0] == 'B':
                for review in reviews:
                    full_review = review[1:] + value[1:]
                    yield business_id, full_review

    def reducer_categorize_stars(self, business_id, reviews):
        """Fan each review out into one record per category.

        Args:
            business_id: the incoming key; not used here.
            reviews: iterable of indexable entries where ``[0]`` is the
                review id, ``[1]`` the star rating and ``[2]`` an iterable
                of categories.

        Yields:
            ``(review_id, [category, stars])`` once per category of each
            review.
        """
        for entry in reviews:
            rid, rating, labels = entry[0], entry[1], entry[2]
            for label in labels:
                # NOTE(review): this entry is keyed by review id only, so a
                # review with several categories keeps just the last one in
                # ``categorey_stars`` -- confirm whether that is intended.
                categorey_stars[rid] = [label, rating]
                yield rid, [label, rating]



    def steps(self):
        """Return the job pipeline: one mapper step, then two reducer-only steps."""
        map_stage = MRStep(mapper=self.mapper_stars_category)
        join_stage = MRStep(reducer=self.reducer_join_business_review)
        categorize_stage = MRStep(reducer=self.reducer_categorize_stars)
        return [map_stage, join_stage, categorize_stage]


if __name__ == '__main__':
    # Time the job and the CSV export together.
    run_started = time.time()
    StarsPerCategory.run()
    output_to_csv.make_csv('stars_per_category_30000', categorey_stars)
    run_seconds = time.time() - run_started
    print("Time: " + str(run_seconds) + "sec")