Example #1
0
def test_lxml_do_not_hold_context_from_previous_parsing():
    """
    Check that an earlier parse does not show up in a later parse error.

    A document with a distinctive ``<justext>`` tag is parsed first; the
    following parse of an empty string is expected to raise
    ``lxml.etree.XMLSyntaxError`` whose message does not mention "justext".

    https://github.com/miso-belica/jusText/issues/17
    """
    # First parse: its tag name must not appear in the later error message.
    first_document = "<justext></justext>"
    html_to_dom(first_document)

    # Second parse: empty input is expected to fail.
    with pytest.raises(lxml.etree.XMLSyntaxError) as exc_info:
        html_to_dom("")

    error_message = str(exc_info.value)
    assert "justext" not in error_message
Example #2
0
def test_lxml_do_not_hold_context_from_previous_parsing():
    """
    Check that an earlier parse does not show up in a later parse error.

    A document with a distinctive ``<justext>`` tag is parsed first; the
    following parse of an empty string is expected to raise
    ``lxml.etree.XMLSyntaxError`` whose message does not mention "justext".

    https://github.com/miso-belica/jusText/issues/17
    """
    # First parse: its tag name must not appear in the later error message.
    first_document = "<justext></justext>"
    html_to_dom(first_document)

    # Second parse: empty input is expected to fail.
    with pytest.raises(lxml.etree.XMLSyntaxError) as exc_info:
        html_to_dom("")

    error_message = str(exc_info.value)
    assert "justext" not in error_message
Example #3
0
    def test_preprocess_simple_unicode_string(self):
        """
        Preprocess a small unicode HTML document and compare the serialization.

        The expected output is the input document without its ``<head>``
        element; the ``<body>`` markup is expected to come back unchanged.
        """
        # Markup shared verbatim by the input and the expected output.
        body_markup = (
            '<body>'
            '<h1>Header</h1>'
            '<p>pre<span>text</span>post<em>emph</em>popost</p>'
            '<p>footer <em>like</em> a boss</p>'
            '</body>'
        )
        source_markup = '<html><head><title>Title</title></head>' + body_markup + '</html>'
        expected = '<html>' + body_markup + '</html>'

        cleaned_dom = preprocessor(html_to_dom(source_markup))
        returned = html.tostring(cleaned_dom).decode("utf8")

        tools.assert_equal(expected, returned)
Example #4
0
def test_preprocess_simple_bytes_string():
    """
    Preprocess a small HTML document given as bytes and compare the result.

    Compared with the input, the expected output has no ``<head>`` element
    and no ``<!-- abcdefgh -->`` comment; the whitespace around the comment
    is expected to remain.
    """
    raw_markup = b"".join([
        b'<html><head><title>Title</title></head><body>',
        b'<h1>Header</h1>',
        b'<p>pre<span>text</span>post<em>emph</em>popost</p>',
        b'<p>footer <em>like</em> a boss</p>',
        b'  <!-- abcdefgh -->\n',
        b'</body></html>',
    ])
    expected = "".join([
        '<html><body>',
        '<h1>Header</h1>',
        '<p>pre<span>text</span>post<em>emph</em>popost</p>',
        '<p>footer <em>like</em> a boss</p>',
        '  \n',
        '</body></html>',
    ])

    # Parse, clean, then serialize back to text for comparison.
    parsed_dom = html_to_dom(raw_markup)
    cleaned_dom = preprocessor(parsed_dom)
    serialized = html.tostring(cleaned_dom)
    returned = serialized.decode("utf8")

    assert expected == returned
Example #5
0
    def test_preprocess_simple_unicode_string(self):
        """
        Preprocess a small unicode HTML document and compare the serialization.

        The expected output is the input document without its ``<head>``
        element; the ``<body>`` markup is expected to come back unchanged.
        """
        # Body fragments shared by the input and the expected output.
        body_parts = [
            "<h1>Header</h1>",
            "<p>pre<span>text</span>post<em>emph</em>popost</p>",
            "<p>footer <em>like</em> a boss</p>",
        ]
        body = "<body>" + "".join(body_parts) + "</body>"
        document = "<html><head><title>Title</title></head>" + body + "</html>"
        expected = "<html>" + body + "</html>"

        parsed_dom = html_to_dom(document)
        cleaned_dom = preprocessor(parsed_dom)
        returned = html.tostring(cleaned_dom).decode("utf8")

        tools.assert_equal(expected, returned)
Example #6
0
def test_preprocess_simple_unicode_string():
    """
    Preprocess a small unicode HTML document and compare the serialization.

    The expected output is the input document without its ``<head>``
    element; the ``<body>`` markup is expected to come back unchanged.
    """
    # One template for both documents; only the ``{head}`` slot differs.
    template = (
        '<html>{head}<body>'
        '<h1>Header</h1>'
        '<p>pre<span>text</span>post<em>emph</em>popost</p>'
        '<p>footer <em>like</em> a boss</p>'
        '</body></html>'
    )
    source_markup = template.format(head='<head><title>Title</title></head>')
    expected = template.format(head='')

    cleaned_dom = preprocessor(html_to_dom(source_markup))
    serialized = html.tostring(cleaned_dom)
    returned = serialized.decode("utf8")

    assert expected == returned
Example #7
0
    def test_preprocess_simple_unicode_xhtml_string_with_declaration(self):
        """
        Preprocess an XHTML document that starts with an XML declaration.

        The expected output keeps the ``<html>`` element with its attributes
        and the empty ``<body>``; the XML declaration, the doctype and the
        ``<head>`` element are absent from it.
        """
        # Parts present only in the input.
        prolog = '<?xml version="1.0" encoding="windows-1250"?><!DOCTYPE html>'
        head_markup = (
            '<head>'
            '<title>Hello World</title>'
            '<meta http-equiv="imagetoolbar" content="no" />'
            '<meta http-equiv="Content-Type" content="text/html; charset=windows-1250" />'
            '</head>'
        )
        # Parts shared by the input and the expected output.
        html_open = '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="sk" lang="sk">'
        body_and_close = '<body id="index"></body></html>'

        source_markup = prolog + html_open + head_markup + body_and_close
        expected = html_open + body_and_close

        cleaned_dom = preprocessor(html_to_dom(source_markup))
        returned = html.tostring(cleaned_dom).decode("utf8")

        tools.assert_equal(expected, returned)
Example #8
0
    def test_preprocess_simple_unicode_xhtml_string_with_declaration(self):
        """
        Preprocess an XHTML document that starts with an XML declaration.

        The expected output keeps the ``<html>`` element with its attributes
        and the empty ``<body>``; the XML declaration, the doctype and the
        ``<head>`` element are absent from it.
        """
        # Fragments shared by the input and the expected output.
        root_open = '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="sk" lang="sk">'
        tail = "".join(['<body id="index">', "</body>", "</html>"])

        # Fragments that appear only in the input document.
        input_only_prefix = '<?xml version="1.0" encoding="windows-1250"?>' + "<!DOCTYPE html>"
        input_only_head = "".join(
            [
                "<head>",
                "<title>Hello World</title>",
                '<meta http-equiv="imagetoolbar" content="no" />',
                '<meta http-equiv="Content-Type" content="text/html; charset=windows-1250" />',
                "</head>",
            ]
        )

        document = input_only_prefix + root_open + input_only_head + tail
        expected = root_open + tail

        parsed_dom = html_to_dom(document)
        cleaned_dom = preprocessor(parsed_dom)
        returned = html.tostring(cleaned_dom).decode("utf8")

        tools.assert_equal(expected, returned)
Example #9
0
 def test_accepts_html_tree(self):
     """
     Check that ``justext`` accepts the result of ``html_to_dom`` as input.

     The test fails if the call raises ``JustextMalformedInput``.
     """
     # NOTE(review): ``html_string`` is not defined inside this method;
     # presumably it is a module-level fixture -- confirm.
     try:
         justext(html_to_dom(html_string), get_stoplist("English"))
     except JustextMalformedInput:
         # The input here is a parsed tree, not a string, so say so.
         self.fail("justext wasn't supposed to raise JustextMalformedInput exception on HTML tree")