def test_tokenize_sentence_to_words(language, sentence, expected_words):
    tokenizer = Tokenizer(language)

    words = tokenizer.to_words(sentence)

    assert words == expected_words
    assert tokenizer.language == language

def test_tokenize_korean_paragraph():
    tokenizer = Tokenizer('korean')
    expected = (
        '회사 동료 분들과 다녀왔는데 분위기도 좋고 음식도 맛있었어요',
        '다만, 강남 토끼 정이 강남 쉑쉑 버거 골목길로 쭉 올라가야 하는데 다들 쉑쉑버거의 유혹에 넘어갈 뻔 했답니다',
        '강남 역 맛 집 토끼정의 외부 모습.'
    )

    paragraph = '회사 동료 분들과 다녀왔는데 분위기도 좋고 음식도 맛있었어요 다만, 강남 토끼정이 강남 쉑쉑버거 골목길로 쭉 올라가야 하는데 다들 쉑쉑버거의 유혹에 넘어갈 뻔 했답니다 강남역 맛집 토끼정의 외부 모습.'

    assert expected == tokenizer.to_sentences(paragraph)

def test_tokenize_sentences_with_abbreviations():
    tokenizer = Tokenizer("english")
    sentences = tokenizer.to_sentences(
        "There are people who are weird, e.g. normal people. These people know you."
    )

    expected = (
        "There are people who are weird, e.g. normal people.",
        "These people know you.",
    )
    assert expected == sentences

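# Illustrative sketch, not a test: the abbreviation handling exercised above can be
# reproduced with NLTK's Punkt sentence splitter. Treating sumy's English Tokenizer
# as Punkt-based is an assumption here, and the abbreviation list below is our own
# hypothetical example, not the project's configuration.
def _punkt_abbreviation_sketch():
    from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer

    punkt_params = PunktParameters()
    punkt_params.abbrev_types = {"e.g"}  # abbreviations are registered without the trailing dot
    splitter = PunktSentenceTokenizer(punkt_params)
    # Splits after "people." but not after the registered abbreviation "e.g."
    return splitter.tokenize(
        "There are people who are weird, e.g. normal people. These people know you."
    )
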
def test_tokenize_bangla_paragraph():
    tokenizer = Tokenizer('bangla')
    expected = (
        'মানুষের সুন্দর মুখ দেখে আনন্দিত হয়ো না।',
        'স্বভাবে সে সুন্দর নয়, দেখতে সুন্দর হলেও তার স্বভাব, তার স্পর্শ, তার রীতিনীতিকে মানুষ ঘৃণা করে।',
        'দুঃস্বভাবের মানুষ মানুষের হৃদয়ে জ্বালা ও বেদনা দেয়?',
        'তার সুন্দর মুখে মানুষ তৃপ্তি পায় না!'
    )

    paragraph = 'মানুষের সুন্দর মুখ দেখে আনন্দিত হয়ো না। স্বভাবে সে সুন্দর নয়, দেখতে সুন্দর হলেও তার স্বভাব, তার স্পর্শ, তার রীতিনীতিকে মানুষ ঘৃণা করে। দুঃস্বভাবের মানুষ মানুষের হৃদয়ে জ্বালা ও বেদনা দেয়? তার সুন্দর মুখে মানুষ তৃপ্তি পায় না! '

    assert expected == tokenizer.to_sentences(paragraph)

def test_get_all_content_words_in_doc():
    summarizer = _build_summarizer(EMPTY_STOP_WORDS)
    s0 = Sentence("One two three.", Tokenizer("english"))
    s1 = Sentence("One two three.", Tokenizer("english"))
    document = build_document([s0, s1])

    content_words = summarizer._get_all_content_words_in_doc(document.sentences)
    content_words_freq = {}
    for w in content_words:
        content_words_freq[w] = content_words_freq.get(w, 0) + 1

    content_words_correct = {"one": 2, "two": 2, "three": 2}
    assert content_words_freq == content_words_correct

def test_tokenize_paragraph():
    tokenizer = Tokenizer("english")
    sentences = tokenizer.to_sentences("""
        I am a very nice sentence with comma, but..
        This is next sentence. "I'm bored", said Pepek.
        Ou jee, duffman is here.
    """)

    expected = (
        "I am a very nice sentence with comma, but..",
        "This is next sentence.",
        '"I\'m bored", said Pepek.',
        "Ou jee, duffman is here.",
    )
    assert expected == sentences

def test_ensure_czech_tokenizer_available():
    tokenizer = Tokenizer("czech")
    assert "czech" == tokenizer.language

    sentences = tokenizer.to_sentences("""
        Měl jsem sen, že toto je sen. Bylo to také zvláštní.
        Jakoby jsem plaval v moři rekurze.
    """)

    expected = (
        "Měl jsem sen, že toto je sen.",
        "Bylo to také zvláštní.",
        "Jakoby jsem plaval v moři rekurze.",
    )
    assert expected == sentences

def test_terms():
    tokenizer = Tokenizer("english")
    text = "wA wB wC wD wB wD wE"

    model = TfDocumentModel(text, tokenizer)

    terms = tuple(sorted(model.terms))
    assert terms == ("wa", "wb", "wc", "wd", "we")

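# Illustrative sketch, not a test: the expected tuple above is just the lowercased,
# de-duplicated, sorted tokens of the input text. The helper name is hypothetical and
# only documents how such a fixture can be derived independently of TfDocumentModel.
def _expected_terms(text):
    return tuple(sorted({token.lower() for token in text.split()}))


# _expected_terms("wA wB wC wD wB wD wE") == ("wa", "wb", "wc", "wd", "we")
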
def test_single_sentence(summarizer):
    s = Sentence("I am one slightly longer sentence.", Tokenizer("english"))
    document = build_document([s])

    returned = summarizer(document, 10)

    assert len(returned) == 1

def test_slovak_alias_into_czech_tokenizer():
    tokenizer = Tokenizer("slovak")
    assert tokenizer.language == "slovak"

    sentences = tokenizer.to_sentences("""
        Je to veľmi fajn. Bodaj by nie.
        Ale na druhej strane čo je to oproti inému?
        To nechám na čitateľa.
    """)

    expected = (
        "Je to veľmi fajn.",
        "Bodaj by nie.",
        "Ale na druhej strane čo je to oproti inému?",
        "To nechám na čitateľa.",
    )
    assert expected == sentences

def test_article_example():
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    parser = PlaintextParser.from_string(
        load_resource("articles/prevko_cz_1.txt"),
        Tokenizer("czech")
    )
    summarizer = LexRankSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("czech")

    sentences = summarizer(parser.document, 20)
    assert len(sentences) == 20

def test_most_frequent_terms():
    tokenizer = Tokenizer("english")
    text = "wE wD wC wB wA wE WD wC wB wE wD WE wC wD wE"

    model = TfDocumentModel(text, tokenizer)

    assert model.most_frequent_terms(1) == ("we",)
    assert model.most_frequent_terms(2) == ("we", "wd")
    assert model.most_frequent_terms(3) == ("we", "wd", "wc")
    assert model.most_frequent_terms(4) == ("we", "wd", "wc", "wb")
    assert model.most_frequent_terms(5) == ("we", "wd", "wc", "wb", "wa")
    assert model.most_frequent_terms() == ("we", "wd", "wc", "wb", "wa")

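# Illustrative sketch, not a test: the ordering asserted above matches a plain
# case-insensitive frequency count (we=5, wd=4, wc=3, wb=2, wa=1). The helper name
# is hypothetical; it only documents where the expected tuples come from.
def _most_frequent_terms_sketch(text, count=None):
    from collections import Counter

    frequencies = Counter(token.lower() for token in text.split())
    return tuple(term for term, _ in frequencies.most_common(count))


# _most_frequent_terms_sketch("wE wD wC wB wA wE WD wC wB wE wD WE wC wD wE", 3)
# == ("we", "wd", "wc")
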
def test_real_example():
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    parser = PlaintextParser.from_string(
        load_resource("snippets/prevko.txt"),
        Tokenizer("czech")
    )
    summarizer = LsaSummarizer(Stemmer("czech"))
    summarizer.stop_words = get_stop_words("czech")

    sentences = summarizer(parser.document, 2)
    assert len(sentences) == 2

def test_stemmer():
    summarizer_w_stemmer = _build_summarizer(EMPTY_STOP_WORDS, Stemmer('english'))
    summarizer_wo_stemmer = _build_summarizer(EMPTY_STOP_WORDS)
    word = Sentence('testing', Tokenizer('english'))

    assert summarizer_w_stemmer._get_content_words_in_sentence(word) == ['test']
    assert summarizer_wo_stemmer._get_content_words_in_sentence(word) == ['testing']

def test_term_frequency():
    tokenizer = Tokenizer("english")
    text = "wA wB wC wA wA wC wD wCwB"

    model = TfDocumentModel(text, tokenizer)

    assert model.term_frequency("wa") == 3
    assert model.term_frequency("wb") == 1
    assert model.term_frequency("wc") == 2
    assert model.term_frequency("wd") == 1
    assert model.term_frequency("wcwb") == 1
    assert model.term_frequency("we") == 0
    assert model.term_frequency("missing") == 0

def test_compute_tf():
    summarizer = _build_summarizer(EMPTY_STOP_WORDS)
    s0 = Sentence("kicking soccer balls.", Tokenizer("english"))
    s1 = Sentence("eating chicken dumplings.", Tokenizer("english"))

    document = build_document([s0, s1])
    freq = summarizer._compute_tf(document.sentences)
    assert freq["kicking"] == 1 / 6
    assert freq["soccer"] == 1 / 6
    assert freq["balls"] == 1 / 6
    assert freq["eating"] == 1 / 6
    assert freq["chicken"] == 1 / 6
    assert freq["dumplings"] == 1 / 6

    document = build_document([s0, s0, s1])
    freq = summarizer._compute_tf(document.sentences)
    assert freq["kicking"] == 2 / 9
    assert freq["soccer"] == 2 / 9
    assert freq["balls"] == 2 / 9
    assert freq["eating"] == 1 / 9
    assert freq["chicken"] == 1 / 9
    assert freq["dumplings"] == 1 / 9

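# Illustrative sketch, not a test: the fractions asserted above are plain word counts
# normalized by the total number of content words (6 in the first document, 9 in the
# second). The helper is hypothetical and independent of the summarizer under test.
def _normalized_tf_sketch(words):
    from collections import Counter

    counts = Counter(words)
    total = len(words)
    return {word: count / total for word, count in counts.items()}


# _normalized_tf_sketch(["kicking", "soccer", "balls"] * 2 + ["eating", "chicken", "dumplings"])
# gives {"kicking": 2/9, "soccer": 2/9, "balls": 2/9, "eating": 1/9, "chicken": 1/9, "dumplings": 1/9}
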
def test_compute_ratings():
    summarizer = _build_summarizer(EMPTY_STOP_WORDS)

    s0 = Sentence("Dog cat fish.", Tokenizer("english"))
    s1 = Sentence("Dog cat camel.", Tokenizer("english"))
    s2 = Sentence("Fish frog horse.", Tokenizer("english"))
    document = build_document([s0, s1, s2])

    ratings = summarizer._compute_ratings(document.sentences)
    assert ratings[s0] == 0
    assert ratings[s1] == -2
    assert ratings[s2] == -1

    # Due to the frequency discounting, after finding sentence s0,
    # s2 should come before s1 since only two of its words get discounted
    # rather than all 3 of s1's.
    s0 = Sentence("one two three", Tokenizer("english"))
    s1 = Sentence("one two four", Tokenizer("english"))
    s2 = Sentence("three five six", Tokenizer("english"))
    document = build_document([s0, s1, s2])

    ratings = summarizer._compute_ratings(document.sentences)
    assert ratings[s0] == 0
    assert ratings[s1] == -2
    assert ratings[s2] == -1

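# Illustrative sketch, not a test: a rough version of the greedy, frequency-discounting
# selection the comment above alludes to. This is our own simplification, not sumy's
# _compute_ratings, and it does not reproduce the exact rating values asserted above;
# it only shows how discounting already-selected words reorders the remaining sentences.
def _greedy_discounted_order(sentences_words):
    from collections import Counter

    frequencies = Counter(word for words in sentences_words for word in words)
    remaining = list(range(len(sentences_words)))
    order = []
    while remaining:
        # Score each remaining sentence by the current frequencies of its words.
        best = max(remaining, key=lambda i: sum(frequencies[w] for w in sentences_words[i]))
        order.append(best)
        remaining.remove(best)
        # Discount the words of the selected sentence so they stop attracting later picks.
        for word in sentences_words[best]:
            frequencies[word] = 0
    return order


# _greedy_discounted_order([["one", "two", "three"],
#                           ["one", "two", "four"],
#                           ["three", "five", "six"]]) == [0, 2, 1]
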
def test_issue_5_svd_converges():
    """Source: https://github.com/miso-belica/sumy/issues/5"""
    pytest.skip("Can't reproduce the issue.")

    parser = PlaintextParser.from_string(
        load_resource("articles/svd_converges.txt"),
        Tokenizer("english")
    )
    summarizer = LsaSummarizer(Stemmer("english"))
    summarizer.stop_words = get_stop_words("english")

    sentences = summarizer(parser.document, 20)
    assert len(sentences) == 20

def test_tf_idf_metric_should_be_real_number():
    """https://github.com/miso-belica/sumy/issues/41"""
    summarizer = KLSummarizer()
    frequencies = summarizer.compute_tf(
        [Sentence("There are five words, jop.", Tokenizer("english"))]
    )

    assert frequencies == {
        "there": 0.2,
        "are": 0.2,
        "five": 0.2,
        "words": 0.2,
        "jop": 0.2,
    }

def test_document_is_all_in_upper_case():
    """
    When all words are in upper case, the plaintext parser treats the first line
    as a heading and the LexRank algorithm raises "ZeroDivisionError: float division
    by zero" because there is no sentence to summarize.
    See https://github.com/miso-belica/sumy/issues/25
    """
    parser = PlaintextParser.from_string(
        "JUST WRITING SOME TEXT. TO TEST CASE. WITH ZERO SENTENCES RETURNED. FROM TOKENIZER.",
        Tokenizer("english")
    )
    summarizer = LexRankSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")

    sentences = summarizer(parser.document, 20)
    assert len(sentences) == 0

def test_real_example():
    parser = PlaintextParser.from_string(
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením. "
        "Přerostly až v reparát z jazyka na konci školního roku. "
        "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, což se chlapci ani trochu nelíbilo. "
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě "
        "o rok mladších dětí budoval vedoucí pozici. "
        "Dost razantně. Fyzickou převahu měl, takže to nedalo až tak moc práce.",
        Tokenizer("czech")
    )
    summarizer = LuhnSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("czech")

    returned = summarizer(parser.document, 2)
    assert list(map(to_unicode, returned)) == [
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením.",
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě o rok mladších dětí budoval vedoucí pozici.",
    ]

def main(args=None):
    args = docopt(to_string(__doc__), args, version=__version__)
    summarizer, document, items_count, reference_summary = handle_arguments(args)

    evaluated_sentences = summarizer(document, items_count)
    reference_document = PlaintextParser.from_string(
        reference_summary,
        Tokenizer(args["--language"])
    )
    reference_sentences = reference_document.document.sentences

    for name, evaluate_document, evaluate in AVAILABLE_EVALUATIONS:
        if evaluate_document:
            result = evaluate(evaluated_sentences, document.sentences)
        else:
            result = evaluate(evaluated_sentences, reference_sentences)
        print("%s: %f" % (name, result))

    return 0

def test_parse_plaintext():
    parser = PlaintextParser.from_string("""
        Ako sa máš? Ja dobre! A ty?
        No mohlo to byť aj lepšie!!! Ale pohodička.

        TOTO JE AKOŽE NADPIS
        A toto je text pod ním, ktorý je textový.
        A tak ďalej...
    """, Tokenizer("czech"))

    document = parser.document
    assert len(document.paragraphs) == 2

    assert len(document.paragraphs[0].headings) == 0
    assert len(document.paragraphs[0].sentences) == 5

    assert len(document.paragraphs[1].headings) == 1
    assert len(document.paragraphs[1].sentences) == 2

def test_parse_plaintext_long():
    parser = PlaintextParser.from_string("""
        Ako sa máš? Ja dobre! A ty?
        No mohlo to byť aj lepšie!!! Ale pohodička.

        TOTO JE AKOŽE NADPIS
        A toto je text pod ním, ktorý je textový.
        A tak ďalej...

        VEĽKOLEPÉ PREKVAPENIE
        Tretí odstavec v tomto texte je úplne o ničom.
        Ale má vety a to je hlavné.
        Takže sa majte na pozore ;-)

        A tak ďalej...

        A tak este dalej!
    """, Tokenizer("czech"))

    document = parser.document
    assert len(document.paragraphs) == 5

    assert len(document.paragraphs[0].headings) == 0
    assert len(document.paragraphs[0].sentences) == 5

    assert len(document.paragraphs[1].headings) == 1
    assert len(document.paragraphs[1].sentences) == 2

    assert len(document.paragraphs[2].headings) == 1
    assert len(document.paragraphs[2].sentences) == 3

    assert len(document.paragraphs[3].headings) == 0
    assert len(document.paragraphs[3].sentences) == 1

    assert len(document.paragraphs[4].headings) == 0
    assert len(document.paragraphs[4].sentences) == 1

def handle_arguments(args, default_input_stream=sys.stdin):
    document_format = args['--format']
    if document_format is not None and document_format not in PARSERS:
        raise ValueError("Unsupported format of input document. Possible values are: %s. Given: %s." % (
            ", ".join(PARSERS.keys()),
            document_format,
        ))

    if args["--url"] is not None:
        parser = PARSERS[document_format or "html"]
        document_content = fetch_url(args["--url"])
    elif args["--file"] is not None:
        parser = PARSERS[document_format or "plaintext"]
        with open(args["--file"], "rb") as file:
            document_content = file.read()
    elif args["--text"] is not None:
        parser = PARSERS[document_format or "plaintext"]
        document_content = args["--text"]
    else:
        parser = PARSERS[document_format or "plaintext"]
        document_content = default_input_stream.read()

    items_count = ItemsCount(args["--length"])

    language = args["--language"]
    if args["--stopwords"]:
        stop_words = read_stop_words(args["--stopwords"])
    else:
        stop_words = get_stop_words(language)

    parser = parser(document_content, Tokenizer(language))
    stemmer = Stemmer(language)

    summarizer_class = next(cls for name, cls in AVAILABLE_METHODS.items() if args[name])
    summarizer = build_summarizer(summarizer_class, stop_words, stemmer, parser)

    return summarizer, parser, items_count

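# Illustrative sketch, not part of the CLI: handle_arguments() expects the dictionary
# docopt builds from the usage string. The hypothetical invocation below uses only flag
# names visible in the function above; "luhn" is assumed to be one of the
# AVAILABLE_METHODS keys (as in the evaluation variant further down), and every other
# method key would also need to be present (set to False) for the summarizer lookup.
#
#     args = {
#         "--format": None,
#         "--url": None,
#         "--file": None,
#         "--text": "One sentence. Another sentence. And one more.",
#         "--length": "2",
#         "--language": "english",
#         "--stopwords": None,
#         "luhn": True,  # plus the remaining AVAILABLE_METHODS keys set to False
#     }
#     summarizer, parser, items_count = handle_arguments(args)
#     for sentence in summarizer(parser.document, items_count):
#         print(sentence)
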
def handle_arguments(args):
    document_format = args["--format"]
    if document_format is not None and document_format not in PARSERS:
        raise ValueError(
            "Unsupported format of input document. Possible values are: %s. Given: %s." % (
                ", ".join(PARSERS.keys()),
                document_format,
            ))

    if args["--url"] is not None:
        parser = PARSERS["html"]
        document_content = fetch_url(args["--url"])
    elif args["--file"] is not None:
        parser = PARSERS.get(document_format, PlaintextParser)
        with open(args["--file"], "rb") as file:
            document_content = file.read()
    else:
        parser = PARSERS["plaintext"]
        document_content = sys.stdin.read()

    summarizer_builder = AVAILABLE_METHODS["luhn"]
    for method, builder in AVAILABLE_METHODS.items():
        if args[method]:
            summarizer_builder = builder
            break

    items_count = ItemsCount(args["--length"])

    parser = parser(document_content, Tokenizer(args["--language"]))

    with open(args["<reference_summary>"], "rb") as file:
        reference_summary = file.read().decode("utf-8")

    return (
        summarizer_builder(parser, args["--language"]),
        parser.document,
        items_count,
        reference_summary,
    )

# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO

from os.path import dirname, join, abspath

from Python.baselineAlgorithm.sumy.nlp.tokenizers import Tokenizer
from Python.baselineAlgorithm.sumy._compat import to_string, to_unicode
from Python.baselineAlgorithm.sumy.models.dom import ObjectDocumentModel, Paragraph, Sentence


_TOKENIZER = Tokenizer("czech")


def expand_resource_path(path):
    return join(abspath(dirname(__file__)), to_string("data"), to_string(path))


def load_resource(path):
    path = expand_resource_path(path)
    with open(path, "rb") as file:
        return to_unicode(file.read())


def build_document(*sets_of_sentences):
    paragraphs = []
    for sentences in sets_of_sentences:
        # NOTE: the original snippet ends here; presumably each set of sentences is
        # wrapped in a Paragraph and the collected paragraphs form the document model.
        paragraphs.append(Paragraph(sentences))
    return ObjectDocumentModel(paragraphs)

def test_sentences_with_same_words_in_different_order_are_different():
    sentence1 = Sentence("word another", Tokenizer("czech"))
    sentence2 = Sentence("another word", Tokenizer("czech"))

    assert sentence1 != sentence2


def test_same_sentences_equal():
    sentence1 = Sentence("word another.", Tokenizer("czech"))
    sentence2 = Sentence("word another.", Tokenizer("czech"))

    assert sentence1 == sentence2


def test_empty_sentences_equal():
    sentence1 = Sentence("", Tokenizer("czech"))
    sentence2 = Sentence("", Tokenizer("czech"))

    assert sentence1 == sentence2