Example #1
0
    def get_topic_distribution(self, review):
        """
        Infers the topic distribution of a single review with the LDA model
        stored in ``self.lda_model``.

        The review is tokenized with
        ``lda_context_utils.create_bag_of_words``, encoded as a bag-of-words
        vector and passed to ``self.lda_model.get_document_topics``. The
        result is then converted by ``lda_document_to_topic_distribution``
        using ``self.num_topics``.

        :type review: str
        :param review: the text of the review
        :return: the value produced by ``lda_document_to_topic_distribution``
        for the topics inferred for the review
        """
        review_bow = lda_context_utils.create_bag_of_words([review])

        # Token ids only make sense relative to the vocabulary the model was
        # trained with. A dictionary built from this single review assigns
        # its own, unrelated ids, so the model would read the wrong words.
        # NOTE(review): assumes ``id2word`` is the gensim Dictionary that was
        # given to the model at training time -- confirm against the caller.
        model_dictionary = getattr(self.lda_model, 'id2word', None)
        if hasattr(model_dictionary, 'doc2bow'):
            dictionary = model_dictionary
        else:
            # The model does not expose a dictionary that can encode text;
            # keep the previous behaviour instead of failing.
            dictionary = corpora.Dictionary(review_bow)

        corpus = dictionary.doc2bow(review_bow[0])
        lda_corpus = self.lda_model.get_document_topics(corpus)

        return lda_document_to_topic_distribution(lda_corpus, self.num_topics)
Example #2
0
    def get_topic_distribution(self, review):
        """
        Infers the topic distribution of a single review with the LDA model
        stored in ``self.lda_model``.

        The review is tokenized with
        ``lda_context_utils.create_bag_of_words``, encoded as a bag-of-words
        vector and passed to ``self.lda_model.get_document_topics``. The
        result is then converted by ``lda_document_to_topic_distribution``
        using ``self.num_topics``.

        :type review: str
        :param review: the text of the review
        :return: the value produced by ``lda_document_to_topic_distribution``
        for the topics inferred for the review
        """
        review_bow = lda_context_utils.create_bag_of_words([review])

        # Token ids only make sense relative to the vocabulary the model was
        # trained with. A dictionary built from this single review assigns
        # its own, unrelated ids, so the model would read the wrong words.
        # NOTE(review): assumes ``id2word`` is the gensim Dictionary that was
        # given to the model at training time -- confirm against the caller.
        model_dictionary = getattr(self.lda_model, 'id2word', None)
        if hasattr(model_dictionary, 'doc2bow'):
            dictionary = model_dictionary
        else:
            # The model does not expose a dictionary that can encode text;
            # keep the previous behaviour instead of failing.
            dictionary = corpora.Dictionary(review_bow)

        corpus = dictionary.doc2bow(review_bow[0])
        lda_corpus = self.lda_model.get_document_topics(corpus)

        return lda_document_to_topic_distribution(lda_corpus, self.num_topics)
    def get_context_rich_topics(self):
        """
        Returns a list with the topics that are context rich and their
        specific/generic frequency ratio

        An LDA model with ``self.num_topics`` topics is trained on the
        specific reviews and stored in ``self.topic_model``. Both the
        specific and the generic reviews are then passed, together with the
        model, to ``lda_context_utils.update_reviews_with_topics``. A topic
        is kept when its weighted frequency over ``self.reviews`` is at least
        ``self.alpha`` and its specific/generic ratio is at least
        ``self.beta``.

        :rtype: list[(int, float)]
        :return: a list of pairs where the first position of the pair indicates
        the topic and the second position indicates the specific/generic
        frequency ratio, sorted by ratio in descending order
        """
        # presumably fills self.specific_reviews and self.generic_reviews --
        # verify against separate_reviews()
        self.separate_reviews()

        specific_reviews_text =\
            context_utils.get_text_from_reviews(self.specific_reviews)
        generic_reviews_text =\
            context_utils.get_text_from_reviews(self.generic_reviews)

        specific_bow =\
            lda_context_utils.create_bag_of_words(specific_reviews_text)
        generic_bow =\
            lda_context_utils.create_bag_of_words(generic_reviews_text)

        specific_dictionary = corpora.Dictionary(specific_bow)
        specific_dictionary.filter_extremes()
        specific_corpus =\
            [specific_dictionary.doc2bow(text) for text in specific_bow]

        # The generic reviews are scored by the model trained below, so they
        # must be encoded with the same dictionary the model is trained with.
        # Ids from a separate dictionary built on the generic reviews would
        # point to different words in the model's vocabulary. Words missing
        # from the specific dictionary are dropped by doc2bow.
        generic_corpus =\
            [specific_dictionary.doc2bow(text) for text in generic_bow]

        self.topic_model = ldamodel.LdaModel(
            specific_corpus, id2word=specific_dictionary,
            num_topics=self.num_topics, minimum_probability=self.epsilon)

        lda_context_utils.update_reviews_with_topics(
            self.topic_model, specific_corpus, self.specific_reviews)
        lda_context_utils.update_reviews_with_topics(
            self.topic_model, generic_corpus, self.generic_reviews)

        topic_ratio_map = {}

        for topic in range(self.num_topics):
            weighted_frq = lda_context_utils.calculate_topic_weighted_frequency(
                topic, self.reviews)

            # Topics that are too rare overall are discarded before the
            # per-subset frequencies are computed, since those values would
            # never be used.
            if weighted_frq < self.alpha:
                continue

            specific_weighted_frq = \
                lda_context_utils.calculate_topic_weighted_frequency(
                    topic, self.specific_reviews)
            generic_weighted_frq = \
                lda_context_utils.calculate_topic_weighted_frequency(
                    topic, self.generic_reviews)

            # Adding 1 to both terms avoids a division by zero when the
            # topic has a generic weighted frequency of zero.
            ratio = (specific_weighted_frq + 1) / (generic_weighted_frq + 1)

            if ratio < self.beta:
                continue

            topic_ratio_map[topic] = ratio

        # Highest specific/generic ratio first
        sorted_topics = sorted(
            topic_ratio_map.items(), key=operator.itemgetter(1), reverse=True)

        return sorted_topics