def build_edmundson(parser, language):
    """Create an Edmundson summarizer wired up with the parser's word lists.

    The null words come from the language's stop-word list; bonus and stigma
    words are taken from attributes the parser exposes.
    """
    stemmer = Stemmer(language)
    summarizer = EdmundsonSummarizer(stemmer)
    summarizer.null_words = get_stop_words(language)
    summarizer.bonus_words = parser.significant_words
    summarizer.stigma_words = parser.stigma_words
    return summarizer
def test_article_example():
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    article_text = load_resource("articles/prevko_cz_1.txt")
    parser = PlaintextParser.from_string(article_text, Tokenizer("czech"))

    summarizer = LexRankSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("czech")

    summary = summarizer(parser.document, 20)

    assert len(summary) == 20
def test_real_example():
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    snippet = load_resource("snippets/prevko.txt")
    parser = PlaintextParser.from_string(snippet, Tokenizer("czech"))

    summarizer = LsaSummarizer(Stemmer("czech"))
    summarizer.stop_words = get_stop_words("czech")

    summary = summarizer(parser.document, 2)

    assert len(summary) == 2
def test_issue_5_svd_converges():
    """Source: https://github.com/miso-belica/sumy/issues/5"""
    pytest.skip("Can't reproduce the issue.")

    # NOTE: everything below is unreachable while the skip above is active;
    # it is kept so the test can be re-enabled once the issue is reproducible.
    article_text = load_resource("articles/svd_converges.txt")
    parser = PlaintextParser.from_string(article_text, Tokenizer("english"))

    summarizer = LsaSummarizer(Stemmer("english"))
    summarizer.stop_words = get_stop_words("english")

    summary = summarizer(parser.document, 20)

    assert len(summary) == 20
def test_document_is_all_in_upper_case():
    """
    When every word is upper-case, the plaintext parser takes the first line
    as a heading and no sentences are left to summarize, which used to make
    LexRank raise "ZeroDivisionError: float division by zero".
    See https://github.com/miso-belica/sumy/issues/25
    """
    text = (
        "JUST WRITING SOME TEXT. TO TEST CASE. "
        "WITH ZERO SENTENCES RETURNED. FROM TOKENIZER."
    )
    parser = PlaintextParser.from_string(text, Tokenizer("english"))

    summarizer = LexRankSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")

    summary = summarizer(parser.document, 20)

    assert len(summary) == 0
def test_real_example():
    """Luhn should pick the two most significant sentences of a short story."""
    story = (
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením. "
        "Přerostly až v reparát z jazyka na konci školního roku. "
        "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, což se chlapci ani trochu nelíbilo. "
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě "
        "o rok mladších dětí budoval vedoucí pozici. "
        "Dost razantně. Fyzickou převahu měl, takže to nedalo až tak moc práce."
    )
    parser = PlaintextParser.from_string(story, Tokenizer("czech"))

    summarizer = LuhnSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("czech")

    summary = summarizer(parser.document, 2)

    expected = [
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením.",
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě o rok mladších dětí budoval vedoucí pozici.",
    ]
    assert [to_unicode(sentence) for sentence in summary] == expected
def handle_arguments(args, default_input_stream=sys.stdin):
    """Turn a parsed CLI argument mapping into (summarizer, parser, items_count).

    The document is taken from the first source present among --url, --file and
    --text, falling back to *default_input_stream*. The parser class defaults to
    "html" for URLs and "plaintext" otherwise, unless --format overrides it.

    Raises:
        ValueError: if --format names a format not present in PARSERS.
    """
    document_format = args["--format"]
    if document_format is not None and document_format not in PARSERS:
        raise ValueError("Unsupported format of input document. Possible values are: %s. Given: %s." % (
            ", ".join(PARSERS.keys()),
            document_format,
        ))

    if args["--url"] is not None:
        parser_class = PARSERS[document_format or "html"]
        document_content = fetch_url(args["--url"])
    elif args["--file"] is not None:
        parser_class = PARSERS[document_format or "plaintext"]
        # Read as bytes; the parser/tokenizer handles decoding.
        with open(args["--file"], "rb") as input_file:
            document_content = input_file.read()
    elif args["--text"] is not None:
        parser_class = PARSERS[document_format or "plaintext"]
        document_content = args["--text"]
    else:
        parser_class = PARSERS[document_format or "plaintext"]
        document_content = default_input_stream.read()

    language = args["--language"]
    items_count = ItemsCount(args["--length"])

    # An explicit stop-words file wins over the built-in list for the language.
    custom_stop_words = args["--stopwords"]
    stop_words = read_stop_words(custom_stop_words) if custom_stop_words else get_stop_words(language)

    parser = parser_class(document_content, Tokenizer(language))
    stemmer = Stemmer(language)

    # Exactly one method flag is expected to be truthy in args.
    summarizer_class = next(cls for name, cls in AVAILABLE_METHODS.items() if args[name])
    summarizer = build_summarizer(summarizer_class, stop_words, stemmer, parser)

    return summarizer, parser, items_count
def test_missing_stop_words_language():
    """Requesting stop words for an unknown language raises LookupError."""
    with pytest.raises(LookupError):
        get_stop_words("klingon")
def test_ok_stop_words_language():
    """A supported language returns a non-trivial stop-word collection."""
    french_stop_words = get_stop_words("french")
    assert len(french_stop_words) > 1
def build_sum_basic(parser, language):
    """Create a SumBasic summarizer for *language* (*parser* is unused here,
    kept for a signature consistent with the other builders)."""
    stemmer = Stemmer(language)
    summarizer = SumBasicSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    return summarizer
def build_lex_rank(parser, language):
    """Create a LexRank summarizer for *language* (*parser* is unused here,
    kept for a signature consistent with the other builders)."""
    stemmer = Stemmer(language)
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    return summarizer
def build_lsa(parser, language):
    """Create an LSA summarizer for *language* (*parser* is unused here,
    kept for a signature consistent with the other builders)."""
    stemmer = Stemmer(language)
    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    return summarizer