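# The tests below assume these module-level imports (a sketch following the
# pre-4.0 gensim layout; exact paths may differ in other versions):
#
#     import os
#     from gensim import utils
#     from gensim.corpora import Dictionary
#     from gensim.summarization import summarize_corpus

def _get_text_from_test_data(self, file):
    # Helper used by the tests below; a minimal sketch reconstructed from the
    # smart_open pattern used elsewhere in this suite. Assumes the fixtures
    # live in a test_data directory next to this file.
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
    with utils.smart_open(os.path.join(pre_path, file), mode="r") as f:
        return f.read()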
def test_corpus_summarization(self):
    text = self._get_text_from_test_data("mihalcea_tarau.txt")

    # Generate the corpus.
    sentences = text.split("\n")
    tokens = [sentence.split() for sentence in sentences]
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

    # Extract the most important documents.
    selected_documents = summarize_corpus(corpus)

    # They are compared to the method reference.
    summary = self._get_text_from_test_data("mihalcea_tarau.summ.txt")
    summary = summary.split('\n')

    # Each sentence in the document selection has to be in the model summary.
    for document in selected_documents:
        # Retrieves all words from the document.
        words = [dictionary[token_id] for (token_id, count) in document]

        # Asserts that all of them are in a sentence from the model reference.
        # Note: the "for sentence in summary" clause must sit inside any();
        # passing a generator expression to assertTrue would always be truthy.
        self.assertTrue(any(all(word in sentence for word in words) for sentence in summary))
def test_low_distinct_words_corpus_summarization_is_empty_list(self):
    text = self._get_text_from_test_data("testlowdistinctwords.txt")

    # Generate the corpus.
    sentences = text.split("\n")
    tokens = [sentence.split() for sentence in sentences]
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

    self.assertEqual(summarize_corpus(corpus), [])
def test_corpus_summarization_is_not_empty_list_on_short_input_text(self):
    text = self._get_text_from_test_data("testsummarization_unrelated.txt")

    # Keeps the first 8 sentences to make the text shorter.
    sentences = text.split('\n')[:8]

    # Generate the corpus.
    tokens = [sentence.split() for sentence in sentences]
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

    self.assertNotEqual(summarize_corpus(corpus), [])
def test_corpus_summarization_ratio(self):
    text = self._get_text_from_test_data("mihalcea_tarau.txt")

    # Generate the corpus.
    sentences = text.split('\n')
    tokens = [sentence.split() for sentence in sentences]
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

    # Makes summaries of the text using different ratio parameters.
    for x in range(1, 10):
        ratio = x / float(10)
        selected_docs = summarize_corpus(corpus, ratio=ratio)

        expected_summary_length = int(len(corpus) * ratio)
        self.assertEqual(len(selected_docs), expected_summary_length)
def test_empty_corpus_summarization_is_empty_list(self):
    self.assertEqual(summarize_corpus([]), [])
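# For reference, standalone usage of the API exercised above (a sketch;
# pre-4.0 gensim, where summarize_corpus ranks the documents of a
# bag-of-words corpus by TextRank and returns the top fraction given by
# ratio, defaulting to 0.2):
#
#     from gensim.corpora import Dictionary
#     from gensim.summarization import summarize_corpus
#
#     tokens = [line.split() for line in text.split('\n')]
#     dictionary = Dictionary(tokens)
#     corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]
#     top_docs = summarize_corpus(corpus, ratio=0.2)  # top ~20% of documents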