def summarizeText(self, text, lines):
    """Summarize ``text`` in two stages: rank each half, then rank the merged result.

    The input is split at its midpoint. Each half is handed to ``TextRank``
    and reduced with ``summarize(10)``; the two partial results are
    concatenated with ``+`` and ranked once more.

    Args:
        text: Sliceable input accepted by ``TextRank`` (presumably a
            ``str`` -- confirm against callers).
        lines: Forwarded unchanged to the final ``TextRank.summarize`` call
            (presumably the number of sentences to keep -- confirm against
            ``TextRank``).

    Returns:
        The value of ``TextRank(...).summarize(lines)`` for the merged
        partial summaries.
    """
    midpoint = len(text) // 2
    first_half = text[:midpoint]
    # Slice through to the end of the input. The earlier upper bound of
    # ``length - 1`` silently discarded the final character (and turned a
    # one-character input into two empty halves).
    second_half = text[midpoint:]

    first_rank = TextRank(first_half)
    second_rank = TextRank(second_half)
    first_summary = first_rank.summarize(10)
    second_summary = second_rank.summarize(10)

    merged_rank = TextRank(first_summary + second_summary)
    return merged_rank.summarize(lines)
def summarizeTextList(self, textList, lines):
    """Summarize a list of sentences in two stages: rank each half, then the merged result.

    The list is split at its midpoint and each half is flattened into one
    string. Each string is handed to ``TextRank`` and reduced with
    ``summarize(10)``; the two partial results are concatenated with ``+``
    and ranked once more.

    Args:
        textList: Sliceable sequence of ``str`` sentences.
        lines: Forwarded unchanged to the final ``TextRank.summarize`` call
            (presumably the number of sentences to keep -- confirm against
            ``TextRank``).

    Returns:
        The value of ``TextRank(...).summarize(lines)`` for the merged
        partial summaries.
    """
    midpoint = len(textList) // 2
    first_sentences = textList[:midpoint]
    # Slice through to the end of the list. The earlier upper bound of
    # ``length - 1`` silently discarded the final sentence.
    second_sentences = textList[midpoint:]

    # Every sentence, including the last one, is followed by a single space,
    # exactly as the previous ``+=`` loops produced.
    first_text = ''.join(sentence + ' ' for sentence in first_sentences)
    second_text = ''.join(sentence + ' ' for sentence in second_sentences)

    first_rank = TextRank(first_text)
    second_rank = TextRank(second_text)
    first_summary = first_rank.summarize(10)
    second_summary = second_rank.summarize(10)

    merged_rank = TextRank(first_summary + second_summary)
    return merged_rank.summarize(lines)
class TestTextRank(unittest.TestCase):
    """Tests for ``TextRank`` HTML processing, sentence scoring, and summarization."""

    def setUp(self):
        """Create a fresh ``TextRank`` instance for every test."""
        self.text_rank = TextRank()

    def test_process_html(self):
        """``process_html`` extracts the expected title, text, and language."""
        article_html = utils.get_article_contents("article1.html")
        expected_article_text = utils.get_article_contents("article1.txt")

        article = self.text_rank.process_html(article_html)

        self.assertEqual(
            "Poll finds Raptors’ playoff run has attracted new fans across Canada",
            article.title,
        )
        self.assertEqual(expected_article_text, article.text)
        self.assertEqual("en", article.config.get_language())

    def test_summarize_from_html(self):
        """``summarize_from_html`` returns a truthy summary for a real article."""
        article_html = utils.get_article_contents("article2.html")

        summary = self.text_rank.summarize_from_html(article_html, 15)

        self.assertTrue(summary)

    def test_evaluate_newspaper_summary_deterministic(self):
        """The three highest-scoring sentences match a fixed expected ranking."""
        article = utils.get_article_contents("article2.txt")
        sentences = tokenize.sent_tokenize(article)

        scores = self.text_rank.evaluate_newspaper_summary(
            "What's inside the Barcode?", article, sentences, "en")

        # Rank by score, highest first; element 1 of each key is compared
        # against sentence text below.
        ranked_sentences = sorted(
            ((score, key[1]) for key, score in scores.items()),
            reverse=True,
        )
        top_sentences = [sentence for _, sentence in ranked_sentences[:3]]

        self.assertListEqual(
            [
                "If the Scanner doesn’t find it, it will not acknowledge the EAN13 barcode.",
                "In this article, we’re gonna take an example of the EAN13 barcode.",
                "What’s inside the Barcode?",
            ],
            top_sentences,
        )

    def test_evaluate_newspaper_summary_returns_normalized_scores(self):
        """Scores from ``evaluate_newspaper_summary`` sum to 1."""
        article = utils.get_article_contents("article2.txt")
        sentences = tokenize.sent_tokenize(article)

        scores = self.text_rank.evaluate_newspaper_summary(
            "What's inside the Barcode?", article, sentences, "en")

        # The scores are summed numerically, so allow for floating-point
        # rounding instead of demanding an exact 1.
        self.assertAlmostEqual(1, sum(scores.values()))

    def test_evaluate_textrank_summary_returns_normalized_scores(self):
        """Scores from ``evaluate_textrank_summary`` sum to 1."""
        # evaluate_textrank_summary depends heavily on word vectorizations
        # which are impractical to load on every test run, so this is all we can do
        article = utils.get_article_contents("article1.txt")
        sentences = tokenize.sent_tokenize(article)

        scores = self.text_rank.evaluate_textrank_summary(sentences)

        # The scores are summed numerically, so allow for floating-point
        # rounding instead of demanding an exact 1.
        self.assertAlmostEqual(1, sum(scores.values()))

    def test_summarize_returns_15_percent_of_sentences(self):
        """A 15 percent summary holds ``ceil(15%)`` of the sentences; 100 percent holds all."""
        article = utils.get_article_contents("article1.txt")
        sentences = tokenize.sent_tokenize(article)

        all_top_sentences = self.text_rank.summarize("test title", article, "en", 100)
        top_15p_sentences = self.text_rank.summarize("test title", article, "en", 15)

        self.assertEqual(len(sentences), len(all_top_sentences))
        self.assertEqual(
            math.ceil(len(all_top_sentences) * 15 / 100),
            len(top_15p_sentences),
        )

    def test_summarize_one_sentence(self):
        """A text identical to its title produces an empty summary."""
        summary = self.text_rank.summarize("Hello world!", "Hello world!", "en", 100)

        self.assertListEqual([], summary)

    def test_summarize_default_language(self):
        """Passing ``None`` as the language still produces a summary."""
        summary = self.text_rank.summarize("Hello world!", "Hello world! Welcome.", None, 100)

        self.assertListEqual(["Welcome."], summary)