Example #1
0
    def test_multiple_line_break(self):
        """Text on either side of a doubled ``<br>`` yields two paragraphs."""
        markup = "".join([
            '<html><body>',
            '  normal text   <br><br> another   text  ',
            '</body></html>',
        ])
        tree = html.fromstring(markup)

        # Sanity check: parsing and serialising must not alter the markup.
        serialized = html.tostring(tree).decode("utf8")
        assert markup == serialized

        paragraphs = ParagraphMaker.make_paragraphs(tree)
        assert len(paragraphs) == 2

        # (text, words_count, tags_count) per paragraph, in document order.
        # Leading, trailing and repeated whitespace is expected to collapse.
        expectations = [
            ("normal text", 2, 0),
            ("another text", 2, 0),
        ]
        for index, (text, words, tags) in enumerate(expectations):
            self.assert_paragraphs_equal(
                paragraphs[index],
                text=text,
                words_count=words,
                tags_count=tags
            )
Example #2
0
    def test_basic(self):
        """A heading followed by two ``<p>`` elements gives three paragraphs."""
        markup = "".join([
            '<html><body>',
            '<h1>Header</h1>',
            '<p>text and some <em>other</em> words <span class="class">that I</span> have in my head now</p>',
            '<p>footer</p>',
            '</body></html>',
        ])
        tree = html.fromstring(markup)

        # Sanity check: parsing and serialising must not alter the markup.
        serialized = html.tostring(tree).decode("utf8")
        tools.assert_equal(markup, serialized)

        paragraphs = ParagraphMaker.make_paragraphs(tree)
        tools.assert_equal(len(paragraphs), 3)

        # (text, words_count, tags_count) per paragraph, in document order.
        # The middle paragraph contains two inline tags (<em> and <span>).
        expectations = [
            ("Header", 1, 0),
            ("text and some other words that I have in my head now", 12, 2),
            ("footer", 1, 0),
        ]
        for index, (text, words, tags) in enumerate(expectations):
            self.assert_paragraphs_equal(
                paragraphs[index],
                text=text,
                words_count=words,
                tags_count=tags,
            )
Example #3
0
    def test_whitespace_handling(self):
        """Whitespace around inline tags decides where words are joined."""
        markup = "".join([
            '<html><body>',
            '<p>pre<em>in</em>post \t pre  <span class="class"> in </span>  post</p>',
            '<div>pre<em> in </em>post</div>',
            '<pre>pre<em>in </em>post</pre>',
            '<blockquote>pre<em> in</em>post</blockquote>',
            '</body></html>',
        ])
        tree = html.fromstring(markup)

        # Sanity check: parsing and serialising must not alter the markup.
        serialized = html.tostring(tree).decode("utf8")
        tools.assert_equal(markup, serialized)

        paragraphs = ParagraphMaker.make_paragraphs(tree)
        tools.assert_equal(len(paragraphs), 4)

        # (text, words_count, tags_count) for <p>, <div>, <pre>, <blockquote>.
        # Inline tags without adjacent whitespace glue their neighbours into
        # a single word; runs of spaces/tabs collapse to one space.
        expectations = [
            ("preinpost pre in post", 4, 2),
            ("pre in post", 3, 1),
            ("prein post", 2, 1),
            ("pre inpost", 2, 1),
        ]
        for index, (text, words, tags) in enumerate(expectations):
            self.assert_paragraphs_equal(
                paragraphs[index],
                text=text,
                words_count=words,
                tags_count=tags,
            )
Example #4
0
def custom_justext(tree, stoplist):
    """Customized version of JusText processing.

    Runs ``preprocessor`` on ``tree``, splits the result into paragraphs,
    classifies them against ``stoplist``, revises that classification and
    returns the paragraph list.
    """
    paragraphs = ParagraphMaker.make_paragraphs(preprocessor(tree))
    # NOTE(review): the positional values presumably map to jusText's length,
    # stopword-density and link-density thresholds plus a no-headings flag;
    # confirm against the signature of classify_paragraphs.
    classify_paragraphs(paragraphs, stoplist, 50, 200, 0.1, 0.2, 0.2, True)
    # NOTE(review): 200 is presumably the maximum heading distance; confirm.
    revise_paragraph_classification(paragraphs, 200)
    return paragraphs
Example #5
0
    def test_whitespace_handling(self):
        """Check word joining and whitespace collapsing around inline tags."""
        body_parts = [
            '<p>pre<em>in</em>post \t pre  <span class="class"> in </span>  post</p>',
            '<div>pre<em> in </em>post</div>',
            '<pre>pre<em>in </em>post</pre>',
            '<blockquote>pre<em> in</em>post</blockquote>',
        ]
        markup = '<html><body>' + "".join(body_parts) + '</body></html>'
        tree = html.fromstring(markup)

        # The parse/serialise round trip must leave the markup unchanged.
        tools.assert_equal(markup, html.tostring(tree).decode("utf8"))

        paragraphs = ParagraphMaker.make_paragraphs(tree)
        tools.assert_equal(len(paragraphs), 4)

        # One entry per block element above, in the same order.
        expectations = [
            {"text": "preinpost pre in post", "words_count": 4, "tags_count": 2},
            {"text": "pre in post", "words_count": 3, "tags_count": 1},
            {"text": "prein post", "words_count": 2, "tags_count": 1},
            {"text": "pre inpost", "words_count": 2, "tags_count": 1},
        ]
        for index, expected in enumerate(expectations):
            self.assert_paragraphs_equal(paragraphs[index], **expected)
Example #6
0
    def test_links(self):
        """Top-level inline content should form paragraphs of its own."""
        markup = "".join([
            '<html><body>',
            '<a>I am <strong>top</strong>-inline\n\n\n\n and I am happy \n</a>',
            '<p>normal text</p>',
            '<code>\nvar i = -INFINITY;\n</code>',
            '<div>after <a>text</a> with variable <var>N</var> </div>',
            '   I am inline\n\n\n\n and I am happy \n',
            '</body></html>',
        ])
        paragraphs = ParagraphMaker.make_paragraphs(html.fromstring(markup))
        tools.assert_equal(len(paragraphs), 5)

        # One entry per expected paragraph, in document order. Only the
        # paragraphs that contain <a> text check ``chars_count_in_links``.
        expectations = [
            dict(words_count=7, tags_count=2,
                 text="I am top-inline and I am happy", chars_count_in_links=31),
            dict(words_count=2, tags_count=0,
                 text="normal text"),
            dict(words_count=4, tags_count=1,
                 text="var i = -INFINITY;"),
            dict(words_count=5, tags_count=2,
                 text="after text with variable N", chars_count_in_links=4),
            dict(words_count=7, tags_count=0,
                 text="I am inline and I am happy"),
        ]
        for index, expected in enumerate(expectations):
            self.assert_paragraphs_equal(paragraphs[index], **expected)
Example #7
0
    def test_no_paragraphs(self):
        """An empty ``<body>`` must not produce any paragraph."""
        markup = '<html><body></body></html>'
        tree = html.fromstring(markup)

        # The parse/serialise round trip must leave the markup unchanged.
        assert markup == html.tostring(tree).decode("utf8")

        found = ParagraphMaker.make_paragraphs(tree)
        assert len(found) == 0
Example #8
0
    def test_no_paragraphs(self):
        """An empty ``<body>`` must not produce any paragraph."""
        markup = '<html><body></body></html>'
        tree = html.fromstring(markup)

        # The parse/serialise round trip must leave the markup unchanged.
        tools.assert_equal(markup, html.tostring(tree).decode("utf8"))

        found = ParagraphMaker.make_paragraphs(tree)
        tools.assert_equal(len(found), 0)
Example #9
0
    def test_no_paragraphs(self):
        """Verify that a document with an empty body has zero paragraphs."""
        empty_document = '<html><body></body></html>'
        tree = html.fromstring(empty_document)

        # Guard against the parser rewriting the input before it is analysed.
        round_tripped = html.tostring(tree).decode("utf8")
        tools.assert_equal(empty_document, round_tripped)

        paragraph_total = len(ParagraphMaker.make_paragraphs(tree))
        tools.assert_equal(paragraph_total, 0)
Example #10
0
    def test_links(self):
        """Top-level inline content should form paragraphs of its own."""
        markup = "".join([
            '<html><body>',
            '<a>I am <strong>top</strong>-inline\n\n\n\n and I am happy \n</a>',
            '<p>normal text</p>',
            '<code>\nvar i = -INFINITY;\n</code>',
            '<div>after <a>text</a> with variable <var>N</var> </div>',
            '   I am inline\n\n\n\n and I am happy \n',
            '</body></html>',
        ])
        paragraphs = ParagraphMaker.make_paragraphs(html.fromstring(markup))
        assert len(paragraphs) == 5

        # One entry per expected paragraph, in document order. The runs of
        # newlines in the inline text are expected to come out as a single
        # "\n"; only paragraphs with <a> text check ``chars_count_in_links``.
        expectations = [
            {
                "words_count": 7,
                "tags_count": 2,
                "text": "I am top-inline\nand I am happy",
                "chars_count_in_links": 31,
            },
            {
                "words_count": 2,
                "tags_count": 0,
                "text": "normal text",
            },
            {
                "words_count": 4,
                "tags_count": 1,
                "text": "var i = -INFINITY;",
            },
            {
                "words_count": 5,
                "tags_count": 2,
                "text": "after text with variable N",
                "chars_count_in_links": 4,
            },
            {
                "words_count": 7,
                "tags_count": 0,
                "text": "I am inline\nand I am happy",
            },
        ]
        for index, expected in enumerate(expectations):
            self.assert_paragraphs_equal(paragraphs[index], **expected)
Example #11
0
    def test_multiple_line_break(self):
        """Two consecutive ``<br>`` tags separate the text into two paragraphs."""
        markup = "".join([
            '<html><body>',
            '  normal text   <br><br> another   text  ',
            '</body></html>',
        ])
        tree = html.fromstring(markup)

        # The parse/serialise round trip must leave the markup unchanged.
        tools.assert_equal(markup, html.tostring(tree).decode("utf8"))

        paragraphs = ParagraphMaker.make_paragraphs(tree)
        tools.assert_equal(len(paragraphs), 2)

        # Both halves are expected with whitespace trimmed and collapsed.
        for index, text in enumerate(["normal text", "another text"]):
            self.assert_paragraphs_equal(
                paragraphs[index],
                text=text,
                words_count=2,
                tags_count=0,
            )
Example #12
0
    def test_basic(self):
        """Check paragraph extraction for a header, a body text and a footer."""
        header = '<h1>Header</h1>'
        content = '<p>text and some <em>other</em> words <span class="class">that I</span> have in my head now</p>'
        footer = '<p>footer</p>'
        markup = '<html><body>' + header + content + footer + '</html>'[:0] + '</body></html>'
        tree = html.fromstring(markup)

        # The parse/serialise round trip must leave the markup unchanged.
        tools.assert_equal(markup, html.tostring(tree).decode("utf8"))

        paragraphs = ParagraphMaker.make_paragraphs(tree)
        tools.assert_equal(len(paragraphs), 3)

        # One entry per block element, in document order; the body text
        # carries two inline tags (<em> and <span>).
        expectations = [
            {"text": "Header", "words_count": 1, "tags_count": 0},
            {
                "text": "text and some other words that I have in my head now",
                "words_count": 12,
                "tags_count": 2,
            },
            {"text": "footer", "words_count": 1, "tags_count": 0},
        ]
        for index, expected in enumerate(expectations):
            self.assert_paragraphs_equal(paragraphs[index], **expected)