Example #1
def test_tokenize_sentence_to_words(language, sentence, expected_words):
    tokenizer = Tokenizer(language)

    words = tokenizer.to_words(sentence)

    assert words == expected_words
    assert tokenizer.language == language
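The parameters above are supplied by a pytest parametrization that is not shown in this snippet. A minimal, hypothetical version (the language/sentence pairs are illustrative, not the original fixture values; Tokenizer is imported as in Example #27):

import pytest

from Python.baselineAlgorithm.sumy.nlp.tokenizers import Tokenizer


@pytest.mark.parametrize("language, sentence, expected_words", [
    ("english", "Hello world", ("Hello", "world")),  # hypothetical pair
])
def test_tokenize_sentence_to_words(language, sentence, expected_words):
    tokenizer = Tokenizer(language)

    words = tokenizer.to_words(sentence)

    assert words == expected_words
    assert tokenizer.language == language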
Example #2
def test_tokenize_korean_paragraph():
    tokenizer = Tokenizer('korean')
    expected = (
        '회사 동료 분들과 다녀왔는데 분위기도 좋고 음식도 맛있었어요',
        '다만, 강남 토끼 정이 강남 쉑쉑 버거 골목길로 쭉 올라가야 하는데 다들 쉑쉑버거의 유혹에 넘어갈 뻔 했답니다',
        '강남 역 맛 집 토끼정의 외부 모습.')

    paragraph = '회사 동료 분들과 다녀왔는데 분위기도 좋고 음식도 맛있었어요 다만, 강남 토끼정이 강남 쉑쉑버거 골목길로 쭉 올라가야 하는데 다들 쉑쉑버거의 유혹에 넘어갈 뻔 했답니다 강남역 맛집 토끼정의 외부 모습.'
    assert expected == tokenizer.to_sentences(paragraph)
Example #3
def test_tokenize_sentences_with_abbreviations():
    tokenizer = Tokenizer("english")
    sentences = tokenizer.to_sentences(
        "There are people who are weird, e.g. normal people. These people know you."
    )

    expected = (
        "There are people who are weird, e.g. normal people.",
        "These people know you.",
    )
    assert expected == sentences
Example #4
def test_tokenize_bangla_paragraph():
    tokenizer = Tokenizer('bangla')
    expected = (
        'মানুষের সুন্দর মুখ দেখে আনন্দিত হয়ো না।',
        'স্বভাবে সে সুন্দর নয়, দেখতে সুন্দর হলেও তার স্বভাব, তার স্পর্শ, তার রীতিনীতিকে মানুষ ঘৃণা করে।',
        'দুঃস্বভাবের মানুষ মানুষের হৃদয়ে জ্বালা ও বেদনা দেয়?',
        'তার সুন্দর মুখে মানুষ তৃপ্তি পায় না!')

    paragraph = 'মানুষের সুন্দর মুখ দেখে আনন্দিত হয়ো না। স্বভাবে সে সুন্দর নয়, দেখতে সুন্দর হলেও তার স্বভাব, তার স্পর্শ, তার রীতিনীতিকে মানুষ ঘৃণা করে। দুঃস্বভাবের মানুষ মানুষের হৃদয়ে জ্বালা ও বেদনা দেয়? তার সুন্দর মুখে মানুষ তৃপ্তি পায় না! '
    assert expected == tokenizer.to_sentences(paragraph)
def test_get_all_content_words_in_doc():
    summarizer = _build_summarizer(EMPTY_STOP_WORDS)
    s0 = Sentence("One two three.", Tokenizer("english"))
    s1 = Sentence("One two three.", Tokenizer("english"))
    document = build_document([s0, s1])

    content_words = summarizer._get_all_content_words_in_doc(
        document.sentences)
    content_words_freq = {}
    for w in content_words:
        content_words_freq[w] = content_words_freq.get(w, 0) + 1
    content_words_correct = {"one": 2, "two": 2, "three": 2}
    assert content_words_freq == content_words_correct
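The counting loop above can be written more compactly with collections.Counter; an equivalent, self-contained check (the list is a stand-in for the summarizer output):

from collections import Counter

content_words = ["one", "two", "three"] * 2  # stand-in for _get_all_content_words_in_doc output
assert Counter(content_words) == {"one": 2, "two": 2, "three": 2}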
Example #6
def test_tokenize_paragraph():
    tokenizer = Tokenizer("english")
    sentences = tokenizer.to_sentences("""
        I am a very nice sentence with comma, but..
        This is next sentence. "I'm bored", said Pepek.
        Ou jee, duffman is here.
    """)

    expected = (
        "I am a very nice sentence with comma, but..",
        "This is next sentence.",
        '"I\'m bored", said Pepek.',
        "Ou jee, duffman is here.",
    )
    assert expected == sentences
Example #7
def test_ensure_czech_tokenizer_available():
    tokenizer = Tokenizer("czech")
    assert "czech" == tokenizer.language

    sentences = tokenizer.to_sentences("""
        Měl jsem sen, že toto je sen. Bylo to také zvláštní.
        Jakoby jsem plaval v moři rekurze.
    """)

    expected = (
        "Měl jsem sen, že toto je sen.",
        "Bylo to také zvláštní.",
        "Jakoby jsem plaval v moři rekurze.",
    )
    assert expected == sentences
Example #8
def test_terms():
    tokenizer = Tokenizer("english")
    text = "wA wB wC wD wB wD wE"
    model = TfDocumentModel(text, tokenizer)

    terms = tuple(sorted(model.terms))
    assert terms == ("wa", "wb", "wc", "wd", "we")
def test_single_sentence(summarizer):
    s = Sentence("I am one slightly longer sentence.", Tokenizer("english"))
    document = build_document([s])

    returned = summarizer(document, 10)

    assert len(returned) == 1
Example #10
def test_slovak_alias_into_czech_tokenizer():
    tokenizer = Tokenizer("slovak")
    assert tokenizer.language == "slovak"

    sentences = tokenizer.to_sentences("""
        Je to veľmi fajn. Bodaj by nie.
        Ale na druhej strane čo je to oproti inému?
        To nechám na čitateľa.
    """)

    expected = (
        "Je to veľmi fajn.",
        "Bodaj by nie.",
        "Ale na druhej strane čo je to oproti inému?",
        "To nechám na čitateľa.",
    )
    assert expected == sentences
def test_article_example():
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    parser = PlaintextParser.from_string(
        load_resource("articles/prevko_cz_1.txt"),
        Tokenizer("czech")
    )
    summarizer = LexRankSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("czech")

    sentences = summarizer(parser.document, 20)
    assert len(sentences) == 20
Example #12
def test_most_frequent_terms():
    tokenizer = Tokenizer("english")
    text = "wE wD wC wB wA wE WD wC wB wE wD WE wC wD wE"
    model = TfDocumentModel(text, tokenizer)

    assert model.most_frequent_terms(1) == ("we", )
    assert model.most_frequent_terms(2) == ("we", "wd")
    assert model.most_frequent_terms(3) == ("we", "wd", "wc")
    assert model.most_frequent_terms(4) == ("we", "wd", "wc", "wb")
    assert model.most_frequent_terms(5) == ("we", "wd", "wc", "wb", "wa")
    assert model.most_frequent_terms() == ("we", "wd", "wc", "wb", "wa")
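The expected ordering follows from case-insensitive term counts; a quick tally, independent of TfDocumentModel, confirms it:

from collections import Counter

text = "wE wD wC wB wA wE WD wC wB wE wD WE wC wD wE"
print(Counter(text.lower().split()))  # we: 5, wd: 4, wc: 3, wb: 2, wa: 1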
def test_real_example():
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    parser = PlaintextParser.from_string(
        load_resource("snippets/prevko.txt"),
        Tokenizer("czech")
    )
    summarizer = LsaSummarizer(Stemmer("czech"))
    summarizer.stop_words = get_stop_words("czech")

    sentences = summarizer(parser.document, 2)
    assert len(sentences) == 2
def test_stemmer():
    summarizer_w_stemmer = _build_summarizer(EMPTY_STOP_WORDS,
                                             Stemmer('english'))
    summarizer_wo_stemmer = _build_summarizer(EMPTY_STOP_WORDS)
    word = Sentence('testing', Tokenizer('english'))
    assert summarizer_w_stemmer._get_content_words_in_sentence(word) == [
        'test'
    ]
    assert summarizer_wo_stemmer._get_content_words_in_sentence(word) == [
        'testing'
    ]
Example #15
def test_term_frequency():
    tokenizer = Tokenizer("english")
    text = "wA wB wC wA wA wC wD wCwB"
    model = TfDocumentModel(text, tokenizer)

    assert model.term_frequency("wa") == 3
    assert model.term_frequency("wb") == 1
    assert model.term_frequency("wc") == 2
    assert model.term_frequency("wd") == 1
    assert model.term_frequency("wcwb") == 1
    assert model.term_frequency("we") == 0
    assert model.term_frequency("missing") == 0
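Note that "wCwB" contains no space, so it is tokenized as the single term "wcwb"; that is why term_frequency("wcwb") is 1 while "we" never occurs. An illustrative tally:

from collections import Counter

print(Counter("wA wB wC wA wA wC wD wCwB".lower().split()))  # wa: 3, wc: 2, wb: 1, wd: 1, wcwb: 1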
def test_compute_tf():
    summarizer = _build_summarizer(EMPTY_STOP_WORDS)
    s0 = Sentence("kicking soccer balls.", Tokenizer("english"))
    s1 = Sentence("eating chicken dumplings.", Tokenizer("english"))
    document = build_document([s0, s1])
    freq = summarizer._compute_tf(document.sentences)
    assert freq["kicking"] == 1 / 6
    assert freq["soccer"] == 1 / 6
    assert freq["balls"] == 1 / 6
    assert freq["eating"] == 1 / 6
    assert freq["chicken"] == 1 / 6
    assert freq["dumplings"] == 1 / 6

    document = build_document([s0, s0, s1])
    freq = summarizer._compute_tf(document.sentences)
    assert freq["kicking"] == 2 / 9
    assert freq["soccer"] == 2 / 9
    assert freq["balls"] == 2 / 9
    assert freq["eating"] == 1 / 9
    assert freq["chicken"] == 1 / 9
    assert freq["dumplings"] == 1 / 9
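The expected fractions are plain normalized counts: with [s0, s1] there are six content-word occurrences, each word appearing once (1/6 each); with [s0, s0, s1] there are nine, and s0's words appear twice (2/9) versus once (1/9) for s1's. An illustrative check, independent of the summarizer:

from collections import Counter

tokens = "kicking soccer balls".split() * 2 + "eating chicken dumplings".split()
print({word: count / len(tokens) for word, count in Counter(tokens).items()})
# kicking, soccer, balls -> 2/9; eating, chicken, dumplings -> 1/9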
def test_compute_ratings():
    summarizer = _build_summarizer(EMPTY_STOP_WORDS)

    s0 = Sentence("Dog cat fish.", Tokenizer("english"))
    s1 = Sentence("Dog cat camel.", Tokenizer("english"))
    s2 = Sentence("Fish frog horse.", Tokenizer("english"))
    document = build_document([s0, s1, s2])

    ratings = summarizer._compute_ratings(document.sentences)
    assert ratings[s0] == 0
    assert ratings[s1] == -2
    assert ratings[s2] == -1

    # Due to the frequency discounting, after selecting sentence s0,
    # s2 should come before s1: only one of s2's words ("three") gets
    # discounted, while two of s1's ("one" and "two") do.
    s0 = Sentence("one two three", Tokenizer("english"))
    s1 = Sentence("one two four", Tokenizer("english"))
    s2 = Sentence("three five six", Tokenizer("english"))
    document = build_document([s0, s1, s2])

    ratings = summarizer._compute_ratings(document.sentences)
    assert ratings[s0] == 0
    assert ratings[s1] == -2
    assert ratings[s2] == -1
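The comment in test_compute_ratings relies on SumBasic-style discounting, where the probability of every word in an already-selected sentence is squared before the next pick. A minimal, self-contained sketch of that arithmetic (illustrative only, not sumy's implementation):

from collections import Counter

tokens = "one two three one two four three five six".split()
prob = {word: count / len(tokens) for word, count in Counter(tokens).items()}

def score(sentence):
    words = sentence.split()
    return sum(prob[w] for w in words) / len(words)

# "one two three" is selected first, so its word probabilities are squared.
for w in ("one", "two", "three"):
    prob[w] **= 2

print(score("one two four"))    # ~0.070 -- two of its three words were discounted
print(score("three five six"))  # ~0.091 -- only one word discounted, so it ranks higher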
def test_issue_5_svd_converges():
    """Source: https://github.com/miso-belica/sumy/issues/5"""
    pytest.skip("Can't reproduce the issue.")

    parser = PlaintextParser.from_string(
        load_resource("articles/svd_converges.txt"),
        Tokenizer("english")
    )
    summarizer = LsaSummarizer(Stemmer("english"))
    summarizer.stop_words = get_stop_words("english")

    sentences = summarizer(parser.document, 20)
    assert len(sentences) == 20
Example #19
def test_tf_idf_metric_should_be_real_number():
    """https://github.com/miso-belica/sumy/issues/41"""
    summarizer = KLSummarizer()
    frequencies = summarizer.compute_tf(
        [Sentence("There are five words, jop.", Tokenizer("english"))])

    assert frequencies == {
        "there": 0.2,
        "are": 0.2,
        "five": 0.2,
        "words": 0.2,
        "jop": 0.2,
    }
def test_document_is_all_in_upper_case():
    """
    When all words are in upper case, the plaintext parser treats the first line
    as a heading and the LexRank algorithm raises "ZeroDivisionError: float
    division by zero" because there is no sentence to summarize.
    See https://github.com/miso-belica/sumy/issues/25
    """
    parser = PlaintextParser.from_string(
        "JUST WRITING SOME TEXT. TO TEST CASE. WITH ZERO SENTENCES RETURNED. FROM TOKENIZER.",
        Tokenizer("english")
    )
    summarizer = LexRankSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")

    sentences = summarizer(parser.document, 20)
    assert len(sentences) == 0
def test_real_example():
    parser = PlaintextParser.from_string(
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením. "
        "Přerostly až v reparát z jazyka na konci školního roku. "
        "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, což se chlapci ani trochu nelíbilo. "
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě "
        "o rok mladších dětí budoval vedoucí pozici. "
        "Dost razantně. Fyzickou převahu měl, takže to nedalo až tak moc práce.",
        Tokenizer("czech"))
    summarizer = LuhnSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("czech")

    returned = summarizer(parser.document, 2)
    assert list(map(to_unicode, returned)) == [
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením.",
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě o rok mladších dětí budoval vedoucí pozici.",
    ]
Example #22
def main(args=None):
    args = docopt(to_string(__doc__), args, version=__version__)
    summarizer, document, items_count, reference_summary = handle_arguments(
        args)

    evaluated_sentences = summarizer(document, items_count)
    reference_document = PlaintextParser.from_string(
        reference_summary, Tokenizer(args["--language"]))
    reference_sentences = reference_document.document.sentences

    for name, evaluate_document, evaluate in AVAILABLE_EVALUATIONS:
        if evaluate_document:
            result = evaluate(evaluated_sentences, document.sentences)
        else:
            result = evaluate(evaluated_sentences, reference_sentences)
        print("%s: %f" % (name, result))

    return 0
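The loop above assumes AVAILABLE_EVALUATIONS is an iterable of (name, evaluate_against_full_document, metric) triples, where the flag selects whether the metric compares the generated summary against the whole document or against the reference summary. A placeholder illustrating that assumed shape (not sumy's actual metric list):

AVAILABLE_EVALUATIONS = (
    ("Sentence overlap", False,
     lambda evaluated, reference: len(set(evaluated) & set(reference)) / max(len(reference), 1)),
)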
def test_parse_plaintext():
    parser = PlaintextParser.from_string(
        """
        Ako sa máš? Ja dobre! A ty? No
        mohlo to byť aj lepšie!!! Ale pohodička.


        TOTO JE AKOŽE NADPIS
        A toto je text pod ním, ktorý je textový.
        A tak ďalej...
    """, Tokenizer("czech"))

    document = parser.document

    assert len(document.paragraphs) == 2

    assert len(document.paragraphs[0].headings) == 0
    assert len(document.paragraphs[0].sentences) == 5

    assert len(document.paragraphs[1].headings) == 1
    assert len(document.paragraphs[1].sentences) == 2
def test_parse_plaintext_long():
    parser = PlaintextParser.from_string(
        """
        Ako sa máš? Ja dobre! A ty? No
        mohlo to byť aj lepšie!!! Ale pohodička.

        TOTO JE AKOŽE NADPIS
        A toto je text pod ním, ktorý je textový.
        A tak ďalej...

        VEĽKOLEPÉ PREKVAPENIE
        Tretí odstavec v tomto texte je úplne o ničom. Ale má
        vety a to je hlavné. Takže sa majte na pozore ;-)

        A tak ďalej...


        A tak este dalej!
    """, Tokenizer("czech"))

    document = parser.document

    assert len(document.paragraphs) == 5

    assert len(document.paragraphs[0].headings) == 0
    assert len(document.paragraphs[0].sentences) == 5

    assert len(document.paragraphs[1].headings) == 1
    assert len(document.paragraphs[1].sentences) == 2

    assert len(document.paragraphs[2].headings) == 1
    assert len(document.paragraphs[2].sentences) == 3

    assert len(document.paragraphs[3].headings) == 0
    assert len(document.paragraphs[3].sentences) == 1

    assert len(document.paragraphs[4].headings) == 0
    assert len(document.paragraphs[4].sentences) == 1
Example #25
def handle_arguments(args, default_input_stream=sys.stdin):
    document_format = args['--format']
    if document_format is not None and document_format not in PARSERS:
        raise ValueError("Unsupported format of input document. Possible values are: %s. Given: %s." % (
            ", ".join(PARSERS.keys()),
            document_format,
        ))

    if args["--url"] is not None:
        parser = PARSERS[document_format or "html"]
        document_content = fetch_url(args["--url"])
    elif args["--file"] is not None:
        parser = PARSERS[document_format or "plaintext"]
        with open(args["--file"], "rb") as file:
            document_content = file.read()
    elif args["--text"] is not None:
        parser = PARSERS[document_format or "plaintext"]
        document_content = args["--text"]
    else:
        parser = PARSERS[document_format or "plaintext"]
        document_content = default_input_stream.read()

    items_count = ItemsCount(args["--length"])

    language = args["--language"]
    if args["--stopwords"]:
        stop_words = read_stop_words(args["--stopwords"])
    else:
        stop_words = get_stop_words(language)

    parser = parser(document_content, Tokenizer(language))
    stemmer = Stemmer(language)

    summarizer_class = next(cls for name, cls in AVAILABLE_METHODS.items() if args[name])
    summarizer = build_summarizer(summarizer_class, stop_words, stemmer, parser)

    return summarizer, parser, items_count
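handle_arguments depends on a PARSERS registry mapping a format name to a parser class. A sketch of the assumed shape, using the import paths of upstream sumy (the examples here vendor it under Python.baselineAlgorithm.sumy):

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser

PARSERS = {
    "html": HtmlParser,
    "plaintext": PlaintextParser,
}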
Example #26
def handle_arguments(args):
    document_format = args["--format"]
    if document_format is not None and document_format not in PARSERS:
        raise ValueError(
            "Unsupported format of input document. Possible values are: %s. Given: %s."
            % (
                ", ".join(PARSERS.keys()),
                document_format,
            ))

    if args["--url"] is not None:
        parser = PARSERS["html"]
        document_content = fetch_url(args["--url"])
    elif args["--file"] is not None:
        parser = PARSERS.get(document_format, PlaintextParser)
        with open(args["--file"], "rb") as file:
            document_content = file.read()
    else:
        parser = PARSERS["plaintext"]
        document_content = sys.stdin.read()

    summarizer_builder = AVAILABLE_METHODS["luhn"]
    for method, builder in AVAILABLE_METHODS.items():
        if args[method]:
            summarizer_builder = builder
            break

    items_count = ItemsCount(args["--length"])

    parser = parser(document_content, Tokenizer(args["--language"]))

    with open(args["<reference_summary>"], "rb") as file:
        reference_summary = file.read().decode("utf-8")

    summarizer = summarizer_builder(parser, args["--language"])
    return summarizer, parser.document, items_count, reference_summary
Example #27
# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO

from os.path import dirname, join, abspath
from Python.baselineAlgorithm.sumy.nlp.tokenizers import Tokenizer
from Python.baselineAlgorithm.sumy._compat import to_string, to_unicode
from Python.baselineAlgorithm.sumy.models.dom import ObjectDocumentModel, Paragraph, Sentence

_TOKENIZER = Tokenizer("czech")


def expand_resource_path(path):
    return join(abspath(dirname(__file__)), to_string("data"), to_string(path))


def load_resource(path):
    path = expand_resource_path(path)
    with open(path, "rb") as file:
        return to_unicode(file.read())


def build_document(*sets_of_sentences):
    paragraphs = []
    for sentences in sets_of_sentences:
        paragraphs.append(Paragraph(sentences))

    return ObjectDocumentModel(paragraphs)


def test_sentences_with_same_words_in_different_order_are_different():
    sentence1 = Sentence("word another", Tokenizer("czech"))
    sentence2 = Sentence("another word", Tokenizer("czech"))

    assert sentence1 != sentence2
def test_same_sentences_equal():
    sentence1 = Sentence("word another.", Tokenizer("czech"))
    sentence2 = Sentence("word another.", Tokenizer("czech"))

    assert sentence1 == sentence2
def test_empty_sentences_equal():
    sentence1 = Sentence("", Tokenizer("czech"))
    sentence2 = Sentence("", Tokenizer("czech"))

    assert sentence1 == sentence2