def build_edmundson(parser, language):
    summarizer = EdmundsonSummarizer(Stemmer(language))
    summarizer.null_words = get_stop_words(language)
    summarizer.bonus_words = parser.significant_words
    summarizer.stigma_words = parser.stigma_words
    return summarizer
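# Note: unlike the other build_* helpers in this section, the Edmundson method
# is configured with three word lists -- null words (the language's stop words),
# plus bonus words and stigma words supplied by the parser -- rather than a
# single stop-word list.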
def test_article_example():
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    parser = PlaintextParser.from_string(
        load_resource("articles/prevko_cz_1.txt"),
        Tokenizer("czech")
    )
    summarizer = LsaSummarizer(Stemmer("czech"))
    summarizer.stop_words = get_stop_words("czech")

    sentences = summarizer(parser.document, 20)

    assert len(sentences) == 20
def test_stemmer():
    summarizer_w_stemmer = _build_summarizer(EMPTY_STOP_WORDS, Stemmer("english"))
    summarizer_wo_stemmer = _build_summarizer(EMPTY_STOP_WORDS)
    sentence = Sentence("testing", Tokenizer("english"))

    assert summarizer_w_stemmer._get_content_words_in_sentence(sentence) == ["test"]
    assert summarizer_wo_stemmer._get_content_words_in_sentence(sentence) == ["testing"]
def test_issue_5_svd_converges():
    """Source: https://github.com/miso-belica/sumy/issues/5"""
    pytest.skip("Can't reproduce the issue.")

    parser = PlaintextParser.from_string(
        load_resource("articles/svd_converges.txt"),
        Tokenizer("english")
    )
    summarizer = LsaSummarizer(Stemmer("english"))
    summarizer.stop_words = get_stop_words("english")

    sentences = summarizer(parser.document, 20)

    assert len(sentences) == 20
def handle_arguments(args, default_input_stream=sys.stdin):
    document_format = args["--format"]
    if document_format is not None and document_format not in PARSERS:
        raise ValueError("Unsupported format of input document. Possible values are: %s. Given: %s." % (
            ", ".join(PARSERS.keys()),
            document_format,
        ))

    # Pick the input source: URL, file, inline text, or the default stream (stdin).
    if args["--url"] is not None:
        parser = PARSERS[document_format or "html"]
        document_content = fetch_url(args["--url"])
    elif args["--file"] is not None:
        parser = PARSERS[document_format or "plaintext"]
        with open(args["--file"], "rb") as file:
            document_content = file.read()
    elif args["--text"] is not None:
        parser = PARSERS[document_format or "plaintext"]
        document_content = args["--text"]
    else:
        parser = PARSERS[document_format or "plaintext"]
        document_content = default_input_stream.read()

    items_count = ItemsCount(args["--length"])

    # Stop words come either from a user-supplied file or from the bundled lists.
    language = args["--language"]
    if args["--stopwords"]:
        stop_words = read_stop_words(args["--stopwords"])
    else:
        stop_words = get_stop_words(language)

    parser = parser(document_content, Tokenizer(language))
    stemmer = Stemmer(language)

    # The first method flag that is set selects the summarizer class.
    summarizer_class = next(cls for name, cls in AVAILABLE_METHODS.items() if args[name])
    summarizer = build_summarizer(summarizer_class, stop_words, stemmer, parser)

    return summarizer, parser, items_count
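# Illustrative sketch (not part of the original module): driving handle_arguments
# with a hand-built docopt-style mapping. The "lsa" method key and the sample
# values below are assumptions for demonstration; the real keys come from the
# CLI definition and AVAILABLE_METHODS.
def _example_handle_arguments():
    args = {name: False for name in AVAILABLE_METHODS}
    args["lsa"] = True  # assumed to be one of the AVAILABLE_METHODS keys
    args.update({
        "--format": None,
        "--url": None,
        "--file": None,
        "--text": "One sentence. Another sentence. Yet another sentence.",
        "--length": "2",
        "--stopwords": None,
        "--language": "english",
    })

    summarizer, parser, items_count = handle_arguments(args)
    for sentence in summarizer(parser.document, items_count):
        print(sentence)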
def test_empty_document():
    document = build_document()
    summarizer = TextRankSummarizer(Stemmer("english"))

    returned = summarizer(document, 10)

    assert len(returned) == 0
def test_slovak_stemmer():
    # Slovak is expected to fall back to the Czech stemmer.
    expected = Stemmer("czech")
    actual = Stemmer("slovak")

    assert type(actual) is type(expected)
    assert expected.__dict__ == actual.__dict__
def test_french_stemmer():
    french_stemmer = Stemmer('french')

    assert "jol" == french_stemmer("jolies")
def test_czech_stemmer():
    czech_stemmer = Stemmer('czech')

    assert "pěkn" == czech_stemmer("pěkný")
def test_german_stemmer():
    german_stemmer = Stemmer('german')

    assert "sterb" == german_stemmer("sterben")
def test_english_stemmer():
    english_stemmer = Stemmer('english')

    assert "beauti" == english_stemmer("beautiful")
def test_missing_stemmer_language():
    with pytest.raises(LookupError):
        Stemmer("klingon")
def build_sum_basic(parser, language):
    summarizer = SumBasicSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    return summarizer
def build_lex_rank(parser, language):
    summarizer = LexRankSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    return summarizer
def build_lsa(parser, language):
    summarizer = LsaSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    return summarizer
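# Illustrative usage sketch (not part of the original module): the build_*
# helpers above all follow the same pattern -- construct the summarizer with a
# stemmer, attach the language's stop words, and return it. The sample text and
# sentence count below are assumptions for demonstration only.
def _example_build_lsa_usage():
    text = "One sentence. Another sentence. Yet another sentence."
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = build_lsa(parser, "english")
    for sentence in summarizer(parser.document, 2):
        print(sentence)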