def get_topic_distribution(self, review):
    """Compute the LDA topic distribution for a single review.

    :type review: str
    :param review: the raw text of one review
    :return: the per-topic weights for the review, as produced by
        ``lda_document_to_topic_distribution`` (dense over
        ``self.num_topics`` topics)
    """
    # Tokenize the single review into a one-document bag-of-words corpus.
    bag_of_words = lda_context_utils.create_bag_of_words([review])

    # Build a throwaway dictionary for this review and map its tokens
    # to (token_id, count) pairs that the LDA model can consume.
    dictionary = corpora.Dictionary(bag_of_words)
    document = dictionary.doc2bow(bag_of_words[0])

    # NOTE(review): this queries self.lda_model, while
    # get_context_rich_topics trains self.topic_model — confirm both
    # attributes are intentional and kept in sync.
    sparse_topics = self.lda_model.get_document_topics(document)

    # Expand the sparse (topic, weight) pairs into a dense distribution.
    return lda_document_to_topic_distribution(sparse_topics, self.num_topics)
def get_context_rich_topics(self):
    """Train an LDA model on the specific reviews and return the
    context-rich topics.

    A topic is kept when its overall weighted frequency reaches
    ``self.alpha`` and the (add-one smoothed) ratio between its
    weighted frequency in specific reviews and in generic reviews
    reaches ``self.beta``.

    Side effects: calls ``self.separate_reviews()``, trains and stores
    ``self.topic_model``, and annotates both review groups with their
    topic distributions.

    :rtype: list[(int, float)]
    :return: a list of pairs where the first position of the pair
        indicates the topic and the second position indicates the
        specific/generic frequency ratio, sorted by ratio descending
    """
    # Split self.reviews into self.specific_reviews / self.generic_reviews.
    self.separate_reviews()

    specific_reviews_text = \
        context_utils.get_text_from_reviews(self.specific_reviews)
    generic_reviews_text = \
        context_utils.get_text_from_reviews(self.generic_reviews)

    specific_bow = \
        lda_context_utils.create_bag_of_words(specific_reviews_text)
    generic_bow = \
        lda_context_utils.create_bag_of_words(generic_reviews_text)

    specific_dictionary = corpora.Dictionary(specific_bow)
    specific_dictionary.filter_extremes()
    specific_corpus = \
        [specific_dictionary.doc2bow(text) for text in specific_bow]

    # NOTE(review): the generic corpus is encoded with its own
    # dictionary, yet below it is scored against a model whose id2word
    # is specific_dictionary — token ids from the two vocabularies do
    # not necessarily agree. Confirm this is intentional.
    generic_dictionary = corpora.Dictionary(generic_bow)
    generic_dictionary.filter_extremes()
    generic_corpus = \
        [generic_dictionary.doc2bow(text) for text in generic_bow]

    # The topic model is trained on the specific reviews only.
    self.topic_model = ldamodel.LdaModel(
        specific_corpus, id2word=specific_dictionary,
        num_topics=self.num_topics,
        minimum_probability=self.epsilon)

    # Attach per-review topic distributions to both review groups so
    # the weighted-frequency calculations below can read them.
    lda_context_utils.update_reviews_with_topics(
        self.topic_model, specific_corpus, self.specific_reviews)
    lda_context_utils.update_reviews_with_topics(
        self.topic_model, generic_corpus, self.generic_reviews)

    topic_ratio_map = {}
    for topic in range(self.num_topics):
        weighted_frq = lda_context_utils.calculate_topic_weighted_frequency(
            topic, self.reviews)
        specific_weighted_frq = \
            lda_context_utils.calculate_topic_weighted_frequency(
                topic, self.specific_reviews)
        generic_weighted_frq = \
            lda_context_utils.calculate_topic_weighted_frequency(
                topic, self.generic_reviews)

        # Skip topics that are too rare across all reviews.
        if weighted_frq < self.alpha:
            continue

        # Add-one smoothing keeps the ratio finite when a topic never
        # appears in the generic reviews.
        ratio = (specific_weighted_frq + 1) / (generic_weighted_frq + 1)

        # Keep only topics clearly biased toward specific reviews.
        if ratio < self.beta:
            continue

        topic_ratio_map[topic] = ratio

    # Most context-rich topics first.
    return sorted(
        topic_ratio_map.items(), key=operator.itemgetter(1), reverse=True)