def get_cluster_centroid(self, documents, idf_array, threshold_calc):
    """
    Compute the centroid vector for a cluster of documents.

    The centroid is the vector for the pseudo-document of the cluster,
    cf. the MEAD paper: per-word average counts across the cluster,
    weighted by IDF, with values below a threshold zeroed out.

    :param documents: documents making up the cluster
    :param idf_array: numpy array of IDF values aligned with the word ids
    :param threshold_calc: 'min', 'mean', or 'max' to select the threshold
        strategy; any other value disables thresholding
    :return: numpy array of centroid values
    :raises ValueError: if the word-count and IDF arrays differ in length
    """
    word_sentence_matrix = Vectors().get_topic_matrix(documents).toarray()
    total_words_in_cluster = word_sentence_matrix.sum(0)
    # number of sentences containing each word, across the cluster
    sentences_per_word = np.count_nonzero(word_sentence_matrix, axis=0)
    # +1 guards against division by zero for words absent from the cluster
    average_count = np.divide(total_words_in_cluster, sentences_per_word + 1)

    if len(average_count) != len(idf_array):
        # ValueError (an Exception subclass, so backward-compatible for
        # callers catching Exception) is the idiomatic type for bad values
        raise ValueError("Cluster centroid arrays must be the same length; "
                         "Word array length {}, IDF array length {}"
                         .format(len(average_count), len(idf_array)))

    centroid_cluster = np.multiply(average_count, idf_array)

    # dispatch table replaces the if/elif chain; unknown strategies mean
    # "no thresholding" (threshold 0), matching the original behavior
    threshold_strategies = {
        'min': self.min_mean_threshold,
        'mean': self.mean_threshold,
        'max': self.max_mean_threshold,
    }
    strategy = threshold_strategies.get(threshold_calc)
    threshold = strategy(centroid_cluster) if strategy else 0

    # set all centroid word values below threshold to zero
    centroid_cluster[centroid_cluster < threshold] = 0

    return centroid_cluster
def test_generate_summary(self):
    """A non-None summary should be produced for every topic cluster."""
    doc_ids_by_topic = {
        'PUP1A': ['TST_ENG_20190101.0001', 'TST_ENG_20190101.0002',
                  'TST20190201.0001', 'TST20190201.0002'],
        'WAR2A': ['TST_ENG_20190301.0001', 'TST_ENG_20190301.0002',
                  'TST20190401.0001', 'TST20190401.0002'],
    }
    topics = {tid: [Document(doc_id) for doc_id in ids]
              for tid, ids in doc_ids_by_topic.items()}

    WordMap.create_mapping()
    Vectors().create_freq_vectors(topics)
    idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                               self.args).get_idf_array()

    for documents in topics.values():
        generator = MeadSummaryGenerator(documents, MeadContentSelector(), self.args)
        self.assertIsNot(generator.generate_summary(idf), None)
def test_melda_generate_summary(self):
    """Each topic should yield a non-None MELDA summary."""
    WordMap.word_set = self.w_set
    WordMap.create_mapping()
    vectors = Vectors()
    vectors.create_freq_vectors(self.topics)
    vectors.create_term_doc_freq(self.topics)

    for documents in self.topics.values():
        generator = MeldaSummaryGenerator(documents, MeldaContentSelector(), self.args)
        self.assertIsNot(generator.generate_summary(self.idf), None)
def test_document_topics(self):
    """The per-document topic distribution should cover all topics and sum to 1."""
    WordMap.word_set = self.w_set
    WordMap.create_mapping()
    Vectors().create_term_doc_freq(self.topics)
    selector = MeldaContentSelector()
    lda_model = selector.build_lda_model(self.doc_list, self.args.lda_topics)

    testtok = ['puppy', 'soldier', 'war', 'fetch']
    testsen = Vectors().create_term_sen_freq(testtok)

    document_topics = lda_model.get_document_topics(testsen, minimum_probability=0)
    topic_dist = [prob[1] for prob in document_topics]

    self.assertEqual(len(topic_dist), self.args.lda_topics)
    # assertAlmostEquals is a deprecated alias; use assertAlmostEqual with
    # an explicit `places` keyword for clarity
    self.assertAlmostEqual(sum(topic_dist), 1, places=2)
def test_get_top_n(self):
    """select_top_n should keep exactly one sentence per LDA topic."""
    WordMap.word_set = self.w_set
    WordMap.create_mapping()
    vectors = Vectors()
    vectors.create_freq_vectors(self.topics)
    vectors.create_term_doc_freq(self.topics)

    selector = MeldaContentSelector()
    model = selector.build_lda_model(self.doc_list, self.args.lda_topics)

    scored = selector.calculate_mead_scores(self.doc_list, self.args, self.idf)
    scored = selector.calculate_lda_scores(scored, model)
    scored = selector.calculate_melda_scores(scored)
    selector.select_top_n(scored, self.args.lda_topics, 1)

    self.assertEqual(len(selector.selected_content), self.args.lda_topics)
def test_melda_info_ordering(self):
    """Information ordering must not change the number of selected sentences."""
    WordMap.word_set = self.w_set
    WordMap.create_mapping()
    vectors = Vectors()
    vectors.create_freq_vectors(self.topics)
    vectors.create_term_doc_freq(self.topics)

    summarizer = MeldaSummaryGenerator(self.doc_list, MeldaContentSelector(), self.args)
    selected = summarizer.select_content(self.idf)

    size_before = len(selected)
    summarizer.order_information()
    size_after = len(selected)

    self.assertEqual(size_before, size_after)
def test_realize_content(self):
    """
    Test applying redundancy penalty during realize_content
    :return:
    """
    expected_content = "I took my small puppy to the dog park today.\n" \
                       "In a park somewhere, a bunch of puppies played fetch with their owners today.\n" \
                       "There were many bigger puppies but he didn't get in a fight with any of them, " \
                       "they just played together with their toys and chased each other.\n" \
                       "They all ran around with their tails wagging and their tongues hanging out having " \
                       "loads of fun in the sun.\n" \
                       "He loves playing so he liked to run around with the other dogs playing fetch.\n" \
                       "Puppies love playing fetch."
    WordMap.word_set = self.w_set
    WordMap.create_mapping()
    Vectors().create_freq_vectors(self.topics)
    generator = MeadSummaryGenerator(self.doc_list, MeadContentSelector(), self.args)
    generator.select_content(self.idf)
    generator.order_information()
    # (removed a leftover no-op: selected_content was assigned to itself)
    realized_content = generator.realize_content()
    self.assertEqual(expected_content, realized_content)
def test_order_information(self):
    """
    Test ordering Sentences by MEAD score
    :return:
    """
    doc_id = 'TST_ENG_20190101.0001'
    # (text, position-in-document) pairs, in the expected MEAD-score order
    ordered_raw = [
        ('Puppies love playing fetch.', 1),
        ("He loves playing so he liked to run around with the other dogs playing fetch.", 3),
        ('They all ran around with their tails wagging '
         'and their tongues hanging out having loads of fun in the sun.', 2),
    ]
    expected_info = [Sentence(text, pos, doc_id) for text, pos in ordered_raw]

    WordMap.word_set = self.w_set
    WordMap.create_mapping()
    Vectors().create_freq_vectors(self.topics)

    generator = MeadSummaryGenerator(self.doc_list, MeadContentSelector(), self.args)
    generator.select_content(self.idf)
    generator.order_information()

    self.assertListEqual(expected_info,
                         generator.content_selector.selected_content[:3])
def get_idf_array(self):
    """
    Use an external corpus (Brown by default, Reuters when args.corpus == 'R')
    to get IDF scores for cluster centroid calculations.

    :return: numpy array of idf values, indexed by WordMap word id
    """
    corpus = brown
    if self.args.corpus == 'R':
        corpus = reuters
    num_words = Vectors().num_unique_words
    n = len(corpus.fileids())  # number of documents in corpus
    # binary document/word incidence matrix
    docs_word_matrix = np.zeros([n, num_words])
    for doc_idx, doc_id in enumerate(corpus.fileids()):
        sentences = list(corpus.sents(doc_id))
        words_in_doc = set()
        for s in sentences:
            s = ' '.join(s)
            proc_s = Preprocessor.get_processed_tokens(Preprocessor.get_processed_sentence(s))
            if proc_s:
                words_in_doc = words_in_doc.union(proc_s)
        for word in words_in_doc:
            word_idx = WordMap.id_of(word)
            # Explicit None check: a truthiness test would wrongly skip
            # word id 0. Assumes WordMap.id_of returns None for words
            # outside the mapping — TODO confirm.
            if word_idx is not None:
                docs_word_matrix[doc_idx, word_idx] = 1
    docs_per_word = np.sum(docs_word_matrix, axis=0)
    self.idf_array = np.log10(np.divide(n, docs_per_word + 1))  # add one to avoid divide by zero error
    return self.idf_array
def test_create_freq_vectors(self):
    """Every document should get one frequency-vector row per sentence."""
    expected_rows = 3
    Vectors().create_freq_vectors(self.topics)
    for documents in self.topics.values():
        for document in documents:
            # row count of the doc matrix == number of sentences
            self.assertEqual(document.vectors.get_shape()[0], expected_rows)
def test_get_lda_scores(self):
    """LDA scores for one sentence should form a distribution over all topics."""
    WordMap.word_set = self.w_set
    WordMap.create_mapping()
    Vectors().create_term_doc_freq(self.topics)

    selector = MeldaContentSelector()
    model = selector.build_lda_model(self.doc_list, self.args.lda_topics)

    first_sentence = self.doc_list[0].sens[0]
    selector.calculate_lda_scores([first_sentence], model)
    scores = first_sentence.lda_scores

    self.assertEqual(len(scores), self.args.lda_topics)
    self.assertAlmostEqual(sum(scores), 1, 2)
def test_get_cluster_centroid(self):
    """The centroid should keep exactly the expected non-zero entries."""
    expected_non_zero = 29
    WordMap.word_set = self.w_set
    WordMap.word_to_id = self.w_map
    Vectors().create_freq_vectors(self.topics)

    selector = MeadContentSelector()
    centroid = selector.get_cluster_centroid(self.doc_list, self.idf,
                                             self.args.c_threshold)

    self.assertEqual(np.count_nonzero(centroid), expected_non_zero)
def test_select_content(self):
    """MEAD content selection should rank the expected sentence first."""
    expected_top_sentence = 'In a park somewhere, a bunch of ' \
                            'puppies played fetch with their owners today.'
    expected_top_mead_score = 2.40038

    Vectors().create_freq_vectors(self.topics)
    selected = MeadContentSelector().select_content(self.topics['PUP1A'],
                                                    self.args, self.idf)
    top_sentence = selected[0]

    self.assertEqual(top_sentence.raw_sentence, expected_top_sentence)
    # compare the score rounded to 5 decimal places
    self.assertEqual(float("{:.5f}".format(top_sentence.mead_score)),
                     expected_top_mead_score)
def test_mead_summary_length(self):
    """
    Test length of summary is less than 100 words
    :return:
    """
    topics = {
        'PUP1A': [
            Document('TST_ENG_20190101.0001'),
            Document('TST_ENG_20190101.0002'),
            Document('TST20190201.0001'),
            Document('TST20190201.0002')
        ],
        'WAR2A': [
            Document('TST_ENG_20190301.0001'),
            Document('TST_ENG_20190301.0002'),
            Document('TST20190401.0001'),
            Document('TST20190401.0002')
        ]
    }
    WordMap.create_mapping()
    vec = Vectors()
    vec.create_freq_vectors(topics)
    idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                               self.args).get_idf_array()
    max_length = 100
    for topic_id, documents in topics.items():
        generator = MeadSummaryGenerator(documents, MeadContentSelector(), self.args)
        generator.select_content(idf)
        generator.order_information()
        realized_content = generator.realize_content()
        # BUG FIX: the old filter (`if not " "`) was always False, so the
        # word list was always empty and the test passed vacuously with a
        # count of 0. split() with no argument splits on any whitespace
        # (including the '\n' joins) and drops empty tokens.
        content_length = len(realized_content.split())
        self.assertLessEqual(content_length, max_length)
def test_get_centroid_score(self):
    """The centroid score of a known sentence should match the expected value."""
    expected_centroid_score = 6.3
    self.args.c_threshold = 'mean'
    WordMap.word_set = self.w_set
    WordMap.word_to_id = self.w_map
    Vectors().create_freq_vectors(self.topics)

    selector = MeadContentSelector()
    centroid = selector.get_cluster_centroid(self.doc_list, self.idf,
                                             self.args.c_threshold)
    sentence = Sentence("Puppies love playing fetch.", 0)
    score = selector.get_centroid_score(sentence, centroid)

    self.assertAlmostEqual(expected_centroid_score, score, 1)
def test_term_topics(self):
    """'puppy' and 'war' should dominate opposite LDA topics."""
    WordMap.word_set = self.w_set
    WordMap.create_mapping()
    Vectors().create_term_doc_freq(self.topics)

    selector = MeldaContentSelector()
    model = selector.build_lda_model(self.doc_list, self.args.lda_topics)

    puppy_dist = [p for _, p in model.get_term_topics(WordMap.id_of('puppy'),
                                                      minimum_probability=0)]
    war_dist = [p for _, p in model.get_term_topics(WordMap.id_of('war'),
                                                    minimum_probability=0)]

    # one topic should favor 'puppy' while the other favors 'war',
    # in either assignment order
    separated = ((puppy_dist[0] > war_dist[0] and puppy_dist[1] < war_dist[1]) or
                 (puppy_dist[0] < war_dist[0] and puppy_dist[1] > war_dist[1]))
    self.assertTrue(separated)
def calculate_lda_scores(self, sentences, lda_model):
    """
    Calculate LDA scores for each of the given sentences
    :param sentences: the given set of sentences
    :param lda_model: the LDA model
    :return: list of sentences with lda_scores populated
    """
    for sentence in sentences:
        term_freq = Vectors().create_term_sen_freq(sentence.tokens)
        topic_probs = lda_model.get_document_topics(term_freq,
                                                    minimum_probability=0)
        # dense array indexed by topic id
        scores = np.zeros(len(topic_probs))
        for topic_id, probability in topic_probs:
            scores[topic_id] = probability
        sentence.lda_scores = scores
    return sentences
def test_apply_redundancy_penalty(self):
    """
    Test the function to apply the redundancy penalty
    :return:
    """
    expected_scores = [
        1.9003829413846463, 1.6243717975775935,
        0.6522065176000799, 2.3571461578060453,
        1.532600545620478, 1.7661796758000055
    ]
    WordMap.word_set = self.w_set
    WordMap.create_mapping()
    Vectors().create_freq_vectors(self.topics)

    selector = MeadContentSelector()
    selected = selector.select_content(self.doc_list, self.args, self.idf)
    selector.apply_redundancy_penalty(selected[0], selector.selected_content)
    actual_scores = [sentence.mead_score for sentence in selector.selected_content]

    self.assertEqual(actual_scores, expected_scores)
def load_documents_for_topics(topic_soup):
    """
    Load documents for each topic and build the shared word mapping
    and vector representations.
    :param topic_soup: parsed soup containing <topic> elements
    :return: dict mapping topic id -> list of loaded documents
    """
    topics = {topic['id']: load_documents(topic)
              for topic in topic_soup.find_all('topic')}

    # All docs are now loaded and every unique word is stored in the
    # WordMap set, so trigger creation of the mapping and the vectors.
    WordMap.create_mapping()
    vectors = Vectors()
    vectors.create_freq_vectors(topics)
    # NOTE(review): term-doc frequencies may be unnecessary when MEAD-based
    # content selection is not run — kept for parity with existing behavior.
    vectors.create_term_doc_freq(topics)

    return topics
def test_get_topic_matrix(self):
    """Every sentence from every doc in the topic must appear in the matrix."""
    expected_num_sentences = 6
    matrix = Vectors().get_topic_matrix(self.topic_one)
    self.assertEqual(expected_num_sentences, matrix.get_shape()[0])