Example #1
    def test_corpus_summarization(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
            text = f.read()

        # Generate the corpus.
        sentences = text.split("\n")
        tokens = [sentence.split() for sentence in sentences]
        dictionary = Dictionary(tokens)
        corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

        # Extract the most important documents.
        selected_documents = summarize_corpus(corpus)

        # Compare them against the reference summary.
        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.summ.txt"), mode="r") as f:
            summary = f.read()
            summary = summary.split('\n')

        # Each sentence in the document selection has to be in the model summary.
        for document in selected_documents:
            # Retrieves all words from the document.
            words = [dictionary[token_id] for (token_id, count) in document]

            # Asserts that all of them are in a sentence from the model reference.
            self.assertTrue(any(all(word in sentence for word in words) for sentence in summary))
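Later examples call a _get_text_from_test_data helper that never appears on this page. A minimal sketch of what it presumably does, assuming the fixtures live in a test_data directory next to the test module, mirroring the smart_open pattern in Example #1:

import os

from gensim import utils

# Sketch of the helper; in the real suite it would be a method on the
# unittest.TestCase class, hence the `self` parameter.
def _get_text_from_test_data(self, file):
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
    with utils.smart_open(os.path.join(pre_path, file), mode="r") as f:
        return f.read()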
Example #2
    def test_corpus_summarization(self):
        text = self._get_text_from_test_data("mihalcea_tarau.txt")

        # Generate the corpus.
        sentences = text.split("\n")
        tokens = [sentence.split() for sentence in sentences]
        dictionary = Dictionary(tokens)
        corpus = [
            dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens
        ]

        # Extract the most important documents.
        selected_documents = summarize_corpus(corpus)

        # Compare them against the reference summary.
        summary = self._get_text_from_test_data("mihalcea_tarau.summ.txt")
        summary = summary.split('\n')

        # Each sentence in the document selection has to be in the model summary.
        for document in selected_documents:
            # Retrieves all words from the document.
            words = [dictionary[token_id] for (token_id, count) in document]

            # Asserts that all of them are in a sentence from the model reference.
            self.assertTrue(
                any(all(word in sentence for word in words) for sentence in summary)
            )
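For readers new to the bag-of-words step used throughout these tests: Dictionary.doc2bow turns a token list into (token_id, count) pairs, and indexing the Dictionary maps an id back to its word, which is what the words = [dictionary[token_id] ...] line relies on. A toy illustration; the concrete ids depend on the dictionary, so treat them as examples:

from gensim.corpora import Dictionary

tokens = [["the", "cat", "sat"], ["the", "dog", "sat"]]
dictionary = Dictionary(tokens)

# doc2bow returns (token_id, count) pairs, sorted by token id.
bow = dictionary.doc2bow(["the", "cat", "cat"])
print(bow)  # e.g. [(0, 2), (2, 1)] -- exact ids vary with the dictionary

# Indexing the dictionary reverses the mapping, recovering the words.
print([dictionary[token_id] for token_id, count in bow])  # e.g. ['cat', 'the']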
Example #4
    def test_low_distinct_words_corpus_summarization_is_empty_list(self):
        text = self._get_text_from_test_data("testlowdistinctwords.txt")

        # Generate the corpus.
        sentences = text.split("\n")
        tokens = [sentence.split() for sentence in sentences]
        dictionary = Dictionary(tokens)
        corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

        self.assertEqual(summarize_corpus(corpus), [])
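A hypothetical sketch, not the actual testlowdistinctwords.txt fixture, of why the result is empty: when sentences share no words, every pairwise similarity used for the graph's edge weights is zero, the graph keeps no usable edges, and summarize_corpus has nothing to rank:

from gensim.corpora import Dictionary
from gensim.summarization.summarizer import summarize_corpus

# Ten sentences with no vocabulary overlap between any pair.
sentences = ["word%d word%d word%d" % (i, i, i) for i in range(10)]
tokens = [sentence.split() for sentence in sentences]
dictionary = Dictionary(tokens)
corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

print(summarize_corpus(corpus))  # expected: [] (older gensim returned None, see Example #7 below)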
Example #5
    def test_corpus_summarization_is_not_empty_list_on_short_input_text(self):
        text = self._get_text_from_test_data("testsummarization_unrelated.txt")

        # Keeps the first 8 sentences to make the text shorter.
        sentences = text.split('\n')[:8]

        # Generate the corpus.
        tokens = [sentence.split() for sentence in sentences]
        dictionary = Dictionary(tokens)
        corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

        self.assertNotEqual(summarize_corpus(corpus), [])
Example #7
    def test_low_distinct_words_corpus_summarization_is_none(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "testlowdistinctwords.txt"), mode="r") as f:
            text = f.read()

        # Generate the corpus.
        sentences = text.split("\n")
        tokens = [sentence.split() for sentence in sentences]
        dictionary = Dictionary(tokens)
        corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

        self.assertIsNone(summarize_corpus(corpus))
Example #8
    def test_corpus_summarization_raises_exception_on_short_input_text(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f:
            text = f.read()

        # Keeps the first 8 sentences to make the text shorter.
        sentences = text.split('\n')[:8]

        # Generate the corpus.
        tokens = [sentence.split() for sentence in sentences]
        dictionary = Dictionary(tokens)
        corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

        # Despite the test name, summarize_corpus does not raise on short
        # input; the test only expects it to still return a result.
        self.assertIsNotNone(summarize_corpus(corpus))
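Examples #4 and #7 exercise the same degenerate input but assert different results: the newer API returns [] where the older one returned None. A hedged shim, with a helper name that is hypothetical, for a suite that has to run against both behaviors:

def _summarize_corpus_normalized(corpus, ratio=0.2):
    """Normalize the degenerate-input result across gensim versions."""
    result = summarize_corpus(corpus, ratio=ratio)
    return [] if result is None else result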
Example #9
    def test_corpus_summarization_ratio(self):
        text = self._get_text_from_test_data("mihalcea_tarau.txt")

        # Generate the corpus.
        sentences = text.split('\n')
        tokens = [sentence.split() for sentence in sentences]
        dictionary = Dictionary(tokens)
        corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

        # Makes summaries of the text using different ratio parameters.
        for x in range(1, 10):
            ratio = x / 10.0
            selected_docs = summarize_corpus(corpus, ratio=ratio)
            expected_summary_length = int(len(corpus) * ratio)

            self.assertEqual(len(selected_docs), expected_summary_length)
Example #11
    def test_corpus_summarization_ratio(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
            text = f.read()

        # Generate the corpus.
        sentences = text.split('\n')
        tokens = [sentence.split() for sentence in sentences]
        dictionary = Dictionary(tokens)
        corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

        # Makes summaries of the text using different ratio parameters.
        for x in range(1, 10):
            ratio = x / 10.0
            selected_docs = summarize_corpus(corpus, ratio=ratio)
            expected_summary_length = int(len(corpus) * ratio)

            self.assertEqual(len(selected_docs), expected_summary_length)
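The expected_summary_length check above relies on summarize_corpus truncating, not rounding, the selected-document count. A quick sanity sketch with a hypothetical corpus size; the real value depends on mihalcea_tarau.txt:

corpus_size = 103  # hypothetical stand-in for len(corpus)
for x in range(1, 10):
    ratio = x / 10.0
    # int() truncates toward zero: 103 * 0.3 = 30.9 selects 30 documents, not 31.
    print(ratio, int(corpus_size * ratio))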
Example #13
    def test_empty_corpus_summarization_is_none(self):
        self.assertIsNone(summarize_corpus([]))
Example #14
    def test_empty_corpus_summarization_is_empty_list(self):
        self.assertEqual(summarize_corpus([]), [])