def test_annotated_text(self):
        path = expand_resource_path("snippets/paragraphs.html")
        url = "http://www.snippet.org/paragraphs.html"
        parser = HtmlParser.from_file(path, url, Tokenizer("czech"))

        document = parser.document

        self.assertEqual(len(document.paragraphs), 2)

        self.assertEqual(len(document.paragraphs[0].headings), 1)
        self.assertEqual(len(document.paragraphs[0].sentences), 1)

        self.assertEqual(to_unicode(document.paragraphs[0].headings[0]),
                         "Toto je nadpis prvej úrovne")
        self.assertEqual(to_unicode(document.paragraphs[0].sentences[0]),
                         "Toto je prvý odstavec a to je fajn.")

        self.assertEqual(len(document.paragraphs[1].headings), 0)
        self.assertEqual(len(document.paragraphs[1].sentences), 2)

        self.assertEqual(
            to_unicode(document.paragraphs[1].sentences[0]),
            "Tento text je tu aby vyplnil prázdne miesto v srdci súboru.")
        self.assertEqual(to_unicode(document.paragraphs[1].sentences[1]),
                         "Aj súbory majú predsa city.")
Example #2
    def test_cue_2(self):
        document = build_document(("ba bb bc bb unknown ľščťžýáíé sb sc sb", ),
                                  ("Pepek likes spinach", ))

        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = (
            "ba",
            "bb",
            "bc",
        )
        summarizer.stigma_words = (
            "sa",
            "sb",
            "sc",
        )

        sentences = summarizer.cue_method(document, 10)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]),
                         "ba bb bc bb unknown ľščťžýáíé sb sc sb")
        self.assertEqual(to_unicode(sentences[1]), "Pepek likes spinach")

        sentences = summarizer.cue_method(document, 1)
        self.assertEqual(len(sentences), 1)
        self.assertEqual(to_unicode(sentences[0]),
                         "ba bb bc bb unknown ľščťžýáíé sb sc sb")
Example #3
    def test_location_method_2(self):
        document = build_document_from_string("""
            # na nb nc ha hb
            ha = 1 + 1 + 0 = 2
            middle = 0
            ha hb = 2 + 1 + 0 = 3

            first = 1
            ha hb ha = 3
            last = 1

            # hc hd
            hb hc hd = 3 + 1 + 0 = 4
            ha hb = 2 + 1 + 0 = 3
        """)

        summarizer = EdmundsonSummarizer()
        summarizer.null_words = (
            "na",
            "nb",
            "nc",
            "nd",
            "ne",
        )

        sentences = summarizer.location_method(document, 4, w_p1=0, w_p2=0)
        self.assertEqual(len(sentences), 4)
        self.assertEqual(to_unicode(sentences[0]), "ha hb = 2 + 1 + 0 = 3")
        self.assertEqual(to_unicode(sentences[1]), "ha hb ha = 3")
        self.assertEqual(to_unicode(sentences[2]), "hb hc hd = 3 + 1 + 0 = 4")
        self.assertEqual(to_unicode(sentences[3]), "ha hb = 2 + 1 + 0 = 3")
Example #4
    def test_mixed_cue_key(self):
        document = build_document_from_string("""
            # This is cool heading
            Because I am sentence I like words
            And because I am string I like characters

            # blank and heading
            This is next paragraph because of blank line above
            Here is the winner because contains words like cool and heading
        """)

        summarizer = EdmundsonSummarizer(cue_weight=1,
                                         key_weight=1,
                                         title_weight=0,
                                         location_weight=0)
        summarizer.bonus_words = ("cool", "heading", "sentence", "words",
                                  "like", "because")
        summarizer.stigma_words = (
            "this",
            "is",
            "I",
            "am",
            "and",
        )

        sentences = summarizer(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]),
                         "Because I am sentence I like words")
        self.assertEqual(
            to_unicode(sentences[1]),
            "Here is the winner because contains words like cool and heading")
Example #5
    def test_title_method_3(self):
        document = build_document_from_string("""
            # This is cool heading
            Because I am sentence I like words
            And because I am string I like characters

            # blank and heading
            This is next paragraph because of blank line above
            Here is the winner because contains words like cool and heading
        """)

        summarizer = EdmundsonSummarizer()
        summarizer.null_words = (
            "this",
            "is",
            "I",
            "am",
            "and",
        )

        sentences = summarizer.title_method(document, 3)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]),
                         "Because I am sentence I like words")
        self.assertEqual(to_unicode(sentences[1]),
                         "This is next paragraph because of blank line above")
        self.assertEqual(
            to_unicode(sentences[2]),
            "Here is the winner because contains words like cool and heading")
Example #6
    def test_cue_3(self):
        document = build_document(
            (
                "ba "*10,
                "bb "*10,
                " sa"*8 + " bb"*10,
                "bb bc ba",
            ),
            (),
            (
                "babbbc "*10,
                "na nb nc nd sa" + " bc"*10,
                " ba n"*10,
            )
        )

        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = ("ba", "bb", "bc",)
        summarizer.stigma_words = ("sa", "sb", "sc",)

        sentences = summarizer.cue_method(document, 5)
        self.assertEqual(len(sentences), 5)
        self.assertEqual(to_unicode(sentences[0]), ("ba "*10).strip())
        self.assertEqual(to_unicode(sentences[1]), ("bb "*10).strip())
        self.assertEqual(to_unicode(sentences[2]), "bb bc ba")
        self.assertEqual(to_unicode(sentences[3]),
            "na nb nc nd sa bc bc bc bc bc bc bc bc bc bc")
        self.assertEqual(to_unicode(sentences[4]), ("ba n "*10).strip())
Example #7
    def test_cue_3(self):
        document = build_document((
            "ba " * 10,
            "bb " * 10,
            " sa" * 8 + " bb" * 10,
            "bb bc ba",
        ), (), (
            "babbbc " * 10,
            "na nb nc nd sa" + " bc" * 10,
            " ba n" * 10,
        ))

        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = (
            "ba",
            "bb",
            "bc",
        )
        summarizer.stigma_words = (
            "sa",
            "sb",
            "sc",
        )

        sentences = summarizer.cue_method(document, 5)
        self.assertEqual(len(sentences), 5)
        self.assertEqual(to_unicode(sentences[0]), ("ba " * 10).strip())
        self.assertEqual(to_unicode(sentences[1]), ("bb " * 10).strip())
        self.assertEqual(to_unicode(sentences[2]), "bb bc ba")
        self.assertEqual(to_unicode(sentences[3]),
                         "na nb nc nd sa bc bc bc bc bc bc bc bc bc bc")
        self.assertEqual(to_unicode(sentences[4]), ("ba n " * 10).strip())
Example #8
    def test_document(self):
        document = build_document((
            "I am the sentence you like",
            "Do you like me too",
        ), (
            "This sentence is better than that above",
            "Are you kidding me",
        ))
        summarizer = LsaSummarizer()
        summarizer.stopwords = (
            "I",
            "am",
            "the",
            "you",
            "are",
            "me",
            "is",
            "than",
            "that",
            "this",
        )

        sentences = summarizer(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]),
                         "I am the sentence you like")
        self.assertEqual(to_unicode(sentences[1]),
                         "This sentence is better than that above")
Example #9
    def test_location_method_2(self):
        document = build_document_from_string("""
            # na nb nc ha hb
            ha = 1 + 1 + 0 = 2
            middle = 0
            ha hb = 2 + 1 + 0 = 3

            first = 1
            ha hb ha = 3
            last = 1

            # hc hd
            hb hc hd = 3 + 1 + 0 = 4
            ha hb = 2 + 1 + 0 = 3
        """)

        summarizer = EdmundsonSummarizer()
        summarizer.null_words = ("na", "nb", "nc", "nd", "ne",)

        sentences = summarizer.location_method(document, 4, w_p1=0, w_p2=0)
        self.assertEqual(len(sentences), 4)
        self.assertEqual(to_unicode(sentences[0]), "ha hb = 2 + 1 + 0 = 3")
        self.assertEqual(to_unicode(sentences[1]), "ha hb ha = 3")
        self.assertEqual(to_unicode(sentences[2]), "hb hc hd = 3 + 1 + 0 = 4")
        self.assertEqual(to_unicode(sentences[3]), "ha hb = 2 + 1 + 0 = 3")
Example #10
    def test_two_sentences(self):
        document = build_document(("Já jsem 1. věta", "A já ta 2. vítězná výhra"))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("já", "jsem", "a", "ta",)

        returned = summarizer(document, 10)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "Já jsem 1. věta")
        self.assertEqual(to_unicode(returned[1]), "A já ta 2. vítězná výhra")
Example #11
    def test_two_sentences(self):
        document = build_document(("I am that 1. sentence", "And I am 2. winning prize"))
        summarizer = GraphSummarizer()
        summarizer.stop_words = ("I", "am", "and", "that",)

        returned = summarizer(document, 10)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "I am that 1. sentence")
        self.assertEqual(to_unicode(returned[1]), "And I am 2. winning prize")
Example #12
def test_two_sentences():
    document = build_document(("I am that 1. sentence", "And I am 2. winning prize"))
    summarizer = TextRankSummarizer()
    summarizer.stop_words = ("I", "am", "and", "that",)

    returned = summarizer(document, 10)
    assert len(returned) == 2
    assert to_unicode(returned[0]) == "I am that 1. sentence"
    assert to_unicode(returned[1]) == "And I am 2. winning prize"
Example #13
    def test_two_sentences(self):
        document = build_document(("I am that 1. sentence", "And I am 2. winning prize"))
        summarizer = TextRankSummarizer()
        summarizer.stop_words = ("I", "am", "and", "that",)

        returned = summarizer(document, 10)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "I am that 1. sentence")
        self.assertEqual(to_unicode(returned[1]), "And I am 2. winning prize")
Example #14
    def test_document(self):
        document = build_document(
            ("I am the sentence you like", "Do you like me too",),
            ("This sentence is better than that above", "Are you kidding me",)
        )
        summarizer = LsaSummarizer()
        summarizer.stopwords = ("I", "am", "the", "you", "are", "me", "is", "than", "that", "this",)

        sentences = summarizer(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]), "I am the sentence you like")
        self.assertEqual(to_unicode(sentences[1]), "This sentence is better than that above")
Example #15
    def test_key_2(self):
        document = build_document(
            ("Om nom nom nom nom", "Sure I summarize it, with bonus",),
            ("This is bonus test sentence with some extra words and bonus",)
        )
        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = ("nom", "bonus",)

        sentences = summarizer.key_method(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]), "Om nom nom nom nom")
        self.assertEqual(to_unicode(sentences[1]),
            "This is bonus test sentence with some extra words and bonus")
Example #16
    def test_sentences_in_right_order(self):
        document = build_document_from_string("""
            # Heading one
            First sentence.
            Second sentence.
            Third sentence.
        """)
        summarizer = RandomSummarizer()

        sentences = summarizer(document, 4)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "First sentence.")
        self.assertEqual(to_unicode(sentences[1]), "Second sentence.")
        self.assertEqual(to_unicode(sentences[2]), "Third sentence.")
Example #17
    def test_sentences_in_right_order(self):
        document = build_document_from_string("""
            # Heading one
            First sentence.
            Second sentence.
            Third sentence.
        """)
        summarizer = RandomSummarizer()

        sentences = summarizer(document, 4)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "First sentence.")
        self.assertEqual(to_unicode(sentences[1]), "Second sentence.")
        self.assertEqual(to_unicode(sentences[2]), "Third sentence.")
Example #18
def test_sentences_in_right_order():
    document = build_document_from_string("""
        # Heading one
        First sentence.
        Second sentence.
        Third sentence.
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 4)
    assert len(sentences) == 3
    assert to_unicode(sentences[0]) == "First sentence."
    assert to_unicode(sentences[1]) == "Second sentence."
    assert to_unicode(sentences[2]) == "Third sentence."
Example #19
    def test_title_method_without_title(self):
        document = build_document(
            ("This is sentence", "This is another one",),
            ("And some next sentence but no heading",)
        )

        summarizer = EdmundsonSummarizer()
        summarizer.null_words = ("this", "is", "some", "and",)

        sentences = summarizer.title_method(document, 10)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "This is sentence")
        self.assertEqual(to_unicode(sentences[1]), "This is another one")
        self.assertEqual(to_unicode(sentences[2]), "And some next sentence but no heading")
Example #20
def test_sentences_in_right_order():
    document = build_document_from_string("""
        # Heading one
        First sentence.
        Second sentence.
        Third sentence.
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 4)
    assert len(sentences) == 3
    assert to_unicode(sentences[0]) == "First sentence."
    assert to_unicode(sentences[1]) == "Second sentence."
    assert to_unicode(sentences[2]) == "Third sentence."
Example #21
    def test_key_no_bonus_words_in_document(self):
        document = build_document(
            ("wa wb wc wd", "I like music",),
            ("This is test sentence with some extra words",)
        )
        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = ("ba", "bb", "bc", "bonus",)

        sentences = summarizer.key_method(document, 10)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "wa wb wc wd")
        self.assertEqual(to_unicode(sentences[1]), "I like music")
        self.assertEqual(to_unicode(sentences[2]),
            "This is test sentence with some extra words")
Example #22
    def test_cue_letters_case(self):
        document = build_document(
            ("X X X", "x x x x",),
            ("w w w", "W W W W",)
        )

        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = ("X", "w",)
        summarizer.stigma_words = ("stigma",)

        sentences = summarizer.cue_method(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]), "x x x x")
        self.assertEqual(to_unicode(sentences[1]), "W W W W")
Example #23
    def test_two_sentences(self):
        document = build_document(
            ("Já jsem 1. věta", "A já ta 2. vítězná výhra"))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = (
            "já",
            "jsem",
            "a",
            "ta",
        )

        returned = summarizer(document, 10)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "Já jsem 1. věta")
        self.assertEqual(to_unicode(returned[1]), "A já ta 2. vítězná výhra")
Example #24
    def test_sentences(self):
        document = build_document_from_string("""
            Nějaký muž šel kolem naší zahrady
            Nějaký jiný muž šel kolem vaší zahrady

            # Nová myšlenka
            Už už abych taky šel
        """)

        self.assertEqual(len(document.sentences), 3)
        self.assertEqual(to_unicode(document.sentences[0]),
            "Nějaký muž šel kolem naší zahrady")
        self.assertEqual(to_unicode(document.sentences[1]),
            "Nějaký jiný muž šel kolem vaší zahrady")
        self.assertEqual(to_unicode(document.sentences[2]),
            "Už už abych taky šel")
Example #25
    def test_sentences(self):
        document = build_document_from_string("""
            Nějaký muž šel kolem naší zahrady
            Nějaký jiný muž šel kolem vaší zahrady

            # Nová myšlenka
            Už už abych taky šel
        """)

        self.assertEqual(len(document.sentences), 3)
        self.assertEqual(to_unicode(document.sentences[0]),
                         "Nějaký muž šel kolem naší zahrady")
        self.assertEqual(to_unicode(document.sentences[1]),
                         "Nějaký jiný muž šel kolem vaší zahrady")
        self.assertEqual(to_unicode(document.sentences[2]),
                         "Už už abych taky šel")
Example #26
def create_list():

    global page_title, l, title_tokens, net_graph, graph_list, q, badtags

    addonv2.path(pdfname)
    page_title, l, title_tokens = bookmark_page_v4.bkpage(pdfname)
    # Outline headings treated as non-content (front/back matter).
    badtags = [
        'cover', 'notes and further reading',
        'title page', 'copyright page', 'contents', 'new to the third edition',
        'review questions', 'laboratory exercises',
        'epilogue: algorithms that run forever', 'brief contents',
        'about the authors', 'exercises', 'solved exercises',
        'about the author', 'preface', 'selected bibliography',
        'acknowledgments', 'references', 'index', 'brief contents', 'foreword',
        'bibliography', 'table of contents', 'foreword', 'appendix',
        'epilogue', 'about the cd'
    ]

    net_graph = nx.DiGraph()
    graph_list = list()

    for i in range(0, len(badtags)):
        badtags[i] = badtags[i].encode('utf-8')
    q = list()
    k = -1

    # Collect [level, title, page_title[title]] triples for bookmarks whose
    # title appears in l (skipping level-'3' entries); unmatched level-1
    # entries are kept with an empty third field.
    for i in range(0, len(addonv2.l)):
        flag = 0
        for j in range(0, len(l)):
            if addonv2.l[i][1] == l[j] and addonv2.l[i][0] != '3':
                k += 1
                t = list()
                t.extend([
                    addonv2.l[i][0], addonv2.l[i][1],
                    page_title[addonv2.l[i][1]]
                ])
                print t, ' ', k
                q.append(t)
                flag = 1
        if flag == 0:
            if int(addonv2.l[i][0]) == 1:
                t = list()
                t.extend([addonv2.l[i][0], addonv2.l[i][1], ""])
                q.append(t)

    # Backfill empty third fields from the following entry.
    for i in range(0, len(q) - 1):
        if q[i][2] == "":
            q[i][2] = q[i + 1][2]

    # Normalize titles: trim whitespace, drop curly quotes, replace NBSPs.
    for i in range(0, len(q)):
        q[i][1] = to_unicode(q[i][1]).strip()
        q[i][1] = re.sub(u"(\u2018|\u2019|\u201c|\u201d)", "", q[i][1])
        q[i][1] = re.sub(u"\xa0", " ", q[i][1])
        print q[i], i

    print len(q)

    gFunc2(q)
    get_data(badtags)
Example #27
    def test_real_example(self):
        """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
        parser = PlaintextParser.from_string(
            load_resource("snippets/prevko.txt"), Tokenizer("czech"))
        summarizer = LsaSummarizer(Stemmer("czech"))
        summarizer.stop_words = get_stop_words("czech")

        sentences = summarizer(parser.document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(
            to_unicode(sentences[0]),
            "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením."
        )
        self.assertEqual(
            to_unicode(sentences[1]),
            "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, "
            "což se chlapci ani trochu nelíbilo.")
Example #28
    def test_single_sentence(self):
        document = build_document(("I am the sentence you like",))
        summarizer = LsaSummarizer()
        summarizer.stopwords = ("I", "am", "the",)

        sentences = summarizer(document, 10)
        self.assertEqual(len(sentences), 1)
        self.assertEqual(to_unicode(sentences[0]), "I am the sentence you like")
Example #29
def test_single_sentence():
    document = build_document(("I am the sentence you like",))
    summarizer = LsaSummarizer()
    summarizer.stopwords = ("I", "am", "the",)

    sentences = summarizer(document, 10)
    assert len(sentences) == 1
    assert to_unicode(sentences[0]) == "I am the sentence you like"
Example #30
    def to_sentences(self, paragraph):
        # Register extra, language-specific abbreviations with the underlying
        # sentence tokenizer (Punkt-style _params) so it does not split
        # sentences after them.
        if hasattr(self._sentence_tokenizer, '_params'):
            extra_abbreviations = self.LANGUAGE_EXTRA_ABREVS.get(
                self._language, [])
            self._sentence_tokenizer._params.abbrev_types.update(
                extra_abbreviations)
        sentences = self._sentence_tokenizer.tokenize(to_unicode(paragraph))
        return tuple(map(unicode.strip, sentences))
Example #31
    def test_single_sentence(self):
        document = build_document(("I am the sentence you like",))
        summarizer = LsaSummarizer()
        summarizer.stopwords = ("I", "am", "the",)

        sentences = summarizer(document, 10)
        self.assertEqual(len(sentences), 1)
        self.assertEqual(to_unicode(sentences[0]), "I am the sentence you like")
Example #32
def test_single_sentence():
    document = build_document(("I am the sentence you like",))
    summarizer = LsaSummarizer()
    summarizer.stopwords = ("I", "am", "the",)

    sentences = summarizer(document, 10)
    assert len(sentences) == 1
    assert to_unicode(sentences[0]) == "I am the sentence you like"
Example #33
    def test_key_2(self):
        document = build_document((
            "Om nom nom nom nom",
            "Sure I summarize it, with bonus",
        ), ("This is bonus test sentence with some extra words and bonus", ))
        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = (
            "nom",
            "bonus",
        )

        sentences = summarizer.key_method(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]), "Om nom nom nom nom")
        self.assertEqual(
            to_unicode(sentences[1]),
            "This is bonus test sentence with some extra words and bonus")
Example #34
def test_less_sentences_than_requested():
    document = build_document_from_string("""
        This is only one sentence.
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 10)
    assert len(sentences) == 1
    assert to_unicode(sentences[0]) == "This is only one sentence."
Example #35
    def test_less_sentences_than_requested(self):
        document = build_document_from_string("""
            This is only one sentence.
        """)
        summarizer = RandomSummarizer()

        sentences = summarizer(document, 10)
        self.assertEqual(len(sentences), 1)
        self.assertEqual(to_unicode(sentences[0]), "This is only one sentence.")
Example #36
def test_less_sentences_than_requested():
    document = build_document_from_string("""
        This is only one sentence.
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 10)
    assert len(sentences) == 1
    assert to_unicode(sentences[0]) == "This is only one sentence."
Example #37
    def test_less_sentences_than_requested(self):
        document = build_document_from_string("""
            This is only one sentence.
        """)
        summarizer = RandomSummarizer()

        sentences = summarizer(document, 10)
        self.assertEqual(len(sentences), 1)
        self.assertEqual(to_unicode(sentences[0]),
                         "This is only one sentence.")
Example #38
    def test_key_no_bonus_words_in_document(self):
        document = build_document((
            "wa wb wc wd",
            "I like music",
        ), ("This is test sentence with some extra words", ))
        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = (
            "ba",
            "bb",
            "bc",
            "bonus",
        )

        sentences = summarizer.key_method(document, 10)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "wa wb wc wd")
        self.assertEqual(to_unicode(sentences[1]), "I like music")
        self.assertEqual(to_unicode(sentences[2]),
                         "This is test sentence with some extra words")
Example #39
    def test_real_example(self):
        """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
        parser = PlaintextParser.from_string(
            "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením. "
            "Přerostly až v reparát z jazyka na konci školního roku. "
            "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, což se chlapci ani trochu nelíbilo. "
            "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě "
            "o rok mladších dětí budoval vedoucí pozici. "
            "Dost razantně. Fyzickou převahu měl, takže to nedalo až tak moc práce.",
            Tokenizer("czech")
        )
        summarizer = LsaSummarizer(stem_word)
        summarizer.stop_words = get_stop_words("czech")

        sentences = summarizer(parser.document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]),
            "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením.")
        self.assertEqual(to_unicode(sentences[1]),
            "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, což se chlapci ani trochu nelíbilo.")
Example #40
def test_annotated_text():
    path = expand_resource_path("snippets/paragraphs.html")
    url = "http://www.snippet.org/paragraphs.html"
    parser = HtmlParser.from_file(path, url, Tokenizer("czech"))

    document = parser.document

    assert len(document.paragraphs) == 2

    assert len(document.paragraphs[0].headings) == 1
    assert len(document.paragraphs[0].sentences) == 1

    assert to_unicode(document.paragraphs[0].headings[0]) == "Toto je nadpis prvej úrovne"
    assert to_unicode(document.paragraphs[0].sentences[0]) == "Toto je prvý odstavec a to je fajn."

    assert len(document.paragraphs[1].headings) == 0
    assert len(document.paragraphs[1].sentences) == 2

    assert to_unicode(document.paragraphs[1].sentences[0]) == "Tento text je tu aby vyplnil prázdne miesto v srdci súboru."
    assert to_unicode(document.paragraphs[1].sentences[1]) == "Aj súbory majú predsa city."
Example #41
    def test_title_method_without_title(self):
        document = build_document((
            "This is sentence",
            "This is another one",
        ), ("And some next sentence but no heading", ))

        summarizer = EdmundsonSummarizer()
        summarizer.null_words = (
            "this",
            "is",
            "some",
            "and",
        )

        sentences = summarizer.title_method(document, 10)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "This is sentence")
        self.assertEqual(to_unicode(sentences[1]), "This is another one")
        self.assertEqual(to_unicode(sentences[2]),
                         "And some next sentence but no heading")
Example #42
    def test_title_method_2(self):
        document = build_document_from_string("""
            # This is cool heading
            Because I am sentence I like words
            And because I am string I like characters

            # blank and heading
            This is next paragraph because of blank line above
            Here is the winner because contains words like cool and heading
        """)

        summarizer = EdmundsonSummarizer()
        summarizer.null_words = ("this", "is", "I", "am", "and",)

        sentences = summarizer.title_method(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]),
            "This is next paragraph because of blank line above")
        self.assertEqual(to_unicode(sentences[1]),
            "Here is the winner because contains words like cool and heading")
Example #43
    def test_cue_2(self):
        document = build_document(
            ("ba bb bc bb unknown ľščťžýáíé sb sc sb",),
            ("Pepek likes spinach",)
        )

        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = ("ba", "bb", "bc",)
        summarizer.stigma_words = ("sa", "sb", "sc",)

        sentences = summarizer.cue_method(document, 10)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]),
            "ba bb bc bb unknown ľščťžýáíé sb sc sb")
        self.assertEqual(to_unicode(sentences[1]), "Pepek likes spinach")

        sentences = summarizer.cue_method(document, 1)
        self.assertEqual(len(sentences), 1)
        self.assertEqual(to_unicode(sentences[0]),
            "ba bb bc bb unknown ľščťžýáíé sb sc sb")
Example #44
    def test_headings(self):
        document = build_document_from_string("""
            Nějaký muž šel kolem naší zahrady
            Nějaký jiný muž šel kolem vaší zahrady

            # Nová myšlenka
            Už už abych taky šel
        """)

        self.assertEqual(len(document.headings), 1)
        self.assertEqual(to_unicode(document.headings[0]), "Nová myšlenka")
Example #45
    def test_headings(self):
        document = build_document_from_string("""
            Nějaký muž šel kolem naší zahrady
            Nějaký jiný muž šel kolem vaší zahrady

            # Nová myšlenka
            Už už abych taky šel
        """)

        self.assertEqual(len(document.headings), 1)
        self.assertEqual(to_unicode(document.headings[0]), "Nová myšlenka")
Example #46
    def test_real_example(self):
        """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
        parser = PlaintextParser.from_string(
            load_resource("snippets/prevko.txt"),
            Tokenizer("czech")
        )
        summarizer = LsaSummarizer(Stemmer("czech"))
        summarizer.stop_words = get_stop_words("czech")

        sentences = summarizer(parser.document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(
            to_unicode(sentences[0]),
            "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením."
        )
        self.assertEqual(
            to_unicode(sentences[1]),
            "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, "
            "což se chlapci ani trochu nelíbilo."
        )
Example #47
    def test_cue_letters_case(self):
        document = build_document((
            "X X X",
            "x x x x",
        ), (
            "w w w",
            "W W W W",
        ))

        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = (
            "X",
            "w",
        )
        summarizer.stigma_words = ("stigma", )

        sentences = summarizer.cue_method(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]), "x x x x")
        self.assertEqual(to_unicode(sentences[1]), "W W W W")
Example #48
    def test_three_sentences_but_second_winner(self):
        document = build_document([
            "I am that 1. sentence",
            "And I am 2. sentence - winning sentence",
            "And I am 3. sentence - winner is my 2nd name",
        ])
        summarizer = GraphSummarizer()
        summarizer.stop_words = ["I", "am", "and", "that"]

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]), "And I am 2. sentence - winning sentence")
Example #49
def test_three_sentences_but_second_winner():
    document = build_document([
        "I am that 1. sentence",
        "And I am 2. sentence - winning sentence",
        "And I am 3. sentence - winner is my 2nd name",
    ])
    summarizer = ReductionSummarizer()
    summarizer.stop_words = ["I", "am", "and", "that"]

    returned = summarizer(document, 1)
    assert len(returned) == 1
    assert to_unicode(returned[0]) == "And I am 2. sentence - winning sentence"
Example #50
    def test_three_sentences_but_second_winner(self):
        document = build_document([
            "I am that 1. sentence",
            "And I am 2. sentence - winning sentence",
            "And I am 3. sentence - winner is my 2nd name",
        ])
        summarizer = TextRankSummarizer()
        summarizer.stop_words = ["I", "am", "and", "that"]

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]), "And I am 2. sentence - winning sentence")
Example #51
    def test_various_words_with_significant_percentage(self):
        document = build_document((
            "1 a",
            "2 b b",
            "3 c c c",
            "4 d d d",
            "5 z z z z",
            "6 e e e e e",
        ))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("1", "2", "3", "4", "5", "6")

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]), "6 e e e e e")

        returned = summarizer(document, 2)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "5 z z z z")
        self.assertEqual(to_unicode(returned[1]), "6 e e e e e")

        returned = summarizer(document, 3)
        self.assertEqual(len(returned), 3)
        self.assertEqual(to_unicode(returned[0]), "3 c c c")
        self.assertEqual(to_unicode(returned[1]), "5 z z z z")
        self.assertEqual(to_unicode(returned[2]), "6 e e e e e")
Example #52
    def test_key_3(self):
        document = build_document((
            "wa",
            "wa wa",
            "wa wa wa",
            "wa wa wa wa",
            "wa Wa Wa Wa wa",
        ), ("x X x X", ))
        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = (
            "wa",
            "X",
        )

        sentences = summarizer.key_method(document, 3)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "wa wa wa")
        self.assertEqual(to_unicode(sentences[1]), "wa wa wa wa")
        self.assertEqual(to_unicode(sentences[2]), "wa Wa Wa Wa wa")

        sentences = summarizer.key_method(document, 3, weight=0)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "wa wa wa wa")
        self.assertEqual(to_unicode(sentences[1]), "wa Wa Wa Wa wa")
        self.assertEqual(to_unicode(sentences[2]), "x X x X")
Example #53
    def test_three_sentences(self):
        document = build_document((
            "wa s s s wa s s s wa",
            "wb s wb s wb s s s s s s s s s wb",
            "wc s s wc s s wc",
        ))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("s", )

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]),
                         "wb s wb s wb s s s s s s s s s wb")

        returned = summarizer(document, 2)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]),
                         "wb s wb s wb s s s s s s s s s wb")
        self.assertEqual(to_unicode(returned[1]), "wc s s wc s s wc")

        returned = summarizer(document, 3)
        self.assertEqual(len(returned), 3)
        self.assertEqual(to_unicode(returned[0]), "wa s s s wa s s s wa")
        self.assertEqual(to_unicode(returned[1]),
                         "wb s wb s wb s s s s s s s s s wb")
        self.assertEqual(to_unicode(returned[2]), "wc s s wc s s wc")
Example #54
    def test_various_words_with_significant_percentage(self):
        document = build_document((
            "1 a",
            "2 b b",
            "3 c c c",
            "4 d d d",
            "5 z z z z",
            "6 e e e e e",
        ))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("1", "2", "3", "4", "5", "6")

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]), "6 e e e e e")

        returned = summarizer(document, 2)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "5 z z z z")
        self.assertEqual(to_unicode(returned[1]), "6 e e e e e")

        returned = summarizer(document, 3)
        self.assertEqual(len(returned), 3)
        self.assertEqual(to_unicode(returned[0]), "3 c c c")
        self.assertEqual(to_unicode(returned[1]), "5 z z z z")
        self.assertEqual(to_unicode(returned[2]), "6 e e e e e")
Example #55
    def test_mixed_cue_key(self):
        document = build_document_from_string("""
            # This is cool heading
            Because I am sentence I like words
            And because I am string I like characters

            # blank and heading
            This is next paragraph because of blank line above
            Here is the winner because contains words like cool and heading
        """)

        summarizer = EdmundsonSummarizer(cue_weight=1, key_weight=1,
            title_weight=0, location_weight=0)
        summarizer.bonus_words = ("cool", "heading", "sentence", "words", "like", "because")
        summarizer.stigma_words = ("this", "is", "I", "am", "and",)

        sentences = summarizer(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]),
            "Because I am sentence I like words")
        self.assertEqual(to_unicode(sentences[1]),
            "Here is the winner because contains words like cool and heading")
Example #56
    def test_real_example(self):
        parser = PlaintextParser.from_string(
            "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením. "
            "Přerostly až v reparát z jazyka na konci školního roku. "
            "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, což se chlapci ani trochu nelíbilo. "
            "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě "
            "o rok mladších dětí budoval vedoucí pozici. "
            "Dost razantně. Fyzickou převahu měl, takže to nedalo až tak moc práce.",
            Tokenizer("czech"))
        summarizer = LuhnSummarizer(stem_word)
        summarizer.stop_words = get_stop_words("czech")

        returned = summarizer(parser.document, 2)
        self.assertEqual(len(returned), 2)
        self.assertEqual(
            to_unicode(returned[0]),
            "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením."
        )
        self.assertEqual(
            to_unicode(returned[1]),
            "Připadal si, že je mezi malými dětmi a realizoval se tím, "
            "že si ve třídě o rok mladších dětí budoval vedoucí pozici.")
Example #57
    def test_key_3(self):
        document = build_document(
            ("wa", "wa wa", "wa wa wa", "wa wa wa wa", "wa Wa Wa Wa wa",),
            ("x X x X",)
        )
        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = ("wa", "X",)

        sentences = summarizer.key_method(document, 3)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "wa wa wa")
        self.assertEqual(to_unicode(sentences[1]), "wa wa wa wa")
        self.assertEqual(to_unicode(sentences[2]), "wa Wa Wa Wa wa")

        sentences = summarizer.key_method(document, 3, weight=0)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "wa wa wa wa")
        self.assertEqual(to_unicode(sentences[1]), "wa Wa Wa Wa wa")
        self.assertEqual(to_unicode(sentences[2]), "x X x X")
Example #58
    def test_three_sentences(self):
        document = build_document((
            "wa s s s wa s s s wa",
            "wb s wb s wb s s s s s s s s s wb",
            "wc s s wc s s wc",
        ))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("s",)

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]), "wb s wb s wb s s s s s s s s s wb")

        returned = summarizer(document, 2)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "wb s wb s wb s s s s s s s s s wb")
        self.assertEqual(to_unicode(returned[1]), "wc s s wc s s wc")

        returned = summarizer(document, 3)
        self.assertEqual(len(returned), 3)
        self.assertEqual(to_unicode(returned[0]), "wa s s s wa s s s wa")
        self.assertEqual(to_unicode(returned[1]), "wb s wb s wb s s s s s s s s s wb")
        self.assertEqual(to_unicode(returned[2]), "wc s s wc s s wc")
Example #59
    def test_to_unicode(self):
        returned = compat.to_unicode(self.o)
        self.assertStringsEqual(UNICODE_STRING, returned)
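All of the snippets above route their string comparisons through a `to_unicode` helper. For orientation only, a minimal Python 3 stand-in could look like the sketch below; the name, signature and behaviour are assumptions for illustration, not the implementation from any of the projects these examples come from.

def to_unicode(value, encoding="utf-8"):
    # Decode bytes with the given encoding; anything else falls back to str().
    if isinstance(value, bytes):
        return value.decode(encoding)
    return str(value)

print(to_unicode(b"N\xc3\xa1pad"))  # -> Nápad
print(to_unicode(42))               # -> 42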