def test_lxml_do_not_hold_context_from_previous_parsing():
    """Regression test for https://github.com/miso-belica/jusText/issues/17.

    The syntax error raised for an empty document must not mention markup
    that was handed to an earlier ``html_to_dom`` call.
    """
    # First parse: a document whose tag name is easy to search for afterwards.
    html_to_dom("<justext></justext>")

    # Second parse: empty input is expected to fail with an XML syntax error.
    with pytest.raises(lxml.etree.XMLSyntaxError) as raised:
        html_to_dom("")

    error_message = str(raised.value)
    assert "justext" not in error_message
def test_preprocess_simple_unicode_string(self):
    # The expected output differs from the input only by the missing
    # <head> element (and the <title> inside it); the body is unchanged.
    body_markup = (
        '<h1>Header</h1>'
        '<p>pre<span>text</span>post<em>emph</em>popost</p>'
        '<p>footer <em>like</em> a boss</p>'
    )
    source = (
        '<html><head><title>Title</title></head><body>'
        + body_markup
        + '</body></html>'
    )
    expected = '<html><body>' + body_markup + '</body></html>'

    cleaned_tree = preprocessor(html_to_dom(source))
    returned = html.tostring(cleaned_tree).decode("utf8")

    tools.assert_equal(expected, returned)
def test_preprocess_simple_bytes_string():
    """Preprocess a bytes document and compare the serialized result.

    The expected output lacks the ``<head>`` element and the HTML comment,
    while the whitespace that surrounded the comment is still present.
    """
    input_fragments = [
        b'<html><head><title>Title</title></head><body>',
        b'<h1>Header</h1>',
        b'<p>pre<span>text</span>post<em>emph</em>popost</p>',
        b'<p>footer <em>like</em> a boss</p>',
        b' <!-- abcdefgh -->\n',
        b'</body></html>',
    ]
    expected_fragments = [
        '<html><body>',
        '<h1>Header</h1>',
        '<p>pre<span>text</span>post<em>emph</em>popost</p>',
        '<p>footer <em>like</em> a boss</p>',
        ' \n',
        '</body></html>',
    ]

    cleaned_tree = preprocessor(html_to_dom(b''.join(input_fragments)))
    returned = html.tostring(cleaned_tree).decode("utf8")
    expected = ''.join(expected_fragments)

    assert expected == returned
def test_preprocess_simple_unicode_string(self):
    # Input and expected output share the same body fragments; only the
    # opening fragment differs (the expected one has no <head>/<title>).
    body_fragments = [
        "<h1>Header</h1>",
        "<p>pre<span>text</span>post<em>emph</em>popost</p>",
        "<p>footer <em>like</em> a boss</p>",
        "</body></html>",
    ]
    document = "".join(
        ["<html><head><title>Title</title></head><body>"] + body_fragments
    )
    expected = "".join(["<html><body>"] + body_fragments)

    processed = preprocessor(html_to_dom(document))
    serialized = html.tostring(processed)
    returned = serialized.decode("utf8")

    tools.assert_equal(expected, returned)
def test_preprocess_simple_unicode_string():
    """Preprocess a text document and compare the serialized result.

    The expected output is the input document without its ``<head>``
    element; the body markup is identical on both sides.
    """
    head = '<head><title>Title</title></head>'
    content = (
        '<h1>Header</h1>'
        '<p>pre<span>text</span>post<em>emph</em>popost</p>'
        '<p>footer <em>like</em> a boss</p>'
    )
    # None of the fragments contain braces, so str.format is safe here.
    page_template = '<html>{head}<body>{content}</body></html>'
    document = page_template.format(head=head, content=content)
    expected = page_template.format(head='', content=content)

    tree = preprocessor(html_to_dom(document))
    returned = html.tostring(tree).decode("utf8")

    assert expected == returned
def test_preprocess_simple_unicode_xhtml_string_with_declaration(self):
    # The input carries an XML declaration, a doctype and a <head> with
    # <title>/<meta> children; none of those appear in the expected output,
    # while the attributes on <html> and <body> are kept as written.
    prolog = '<?xml version="1.0" encoding="windows-1250"?>' + '<!DOCTYPE html>'
    html_open = '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="sk" lang="sk">'
    head = (
        '<head>'
        '<title>Hello World</title>'
        '<meta http-equiv="imagetoolbar" content="no" />'
        '<meta http-equiv="Content-Type" content="text/html; charset=windows-1250" />'
        '</head>'
    )
    body_and_close = '<body id="index">' + '</body>' + '</html>'

    document = prolog + html_open + head + body_and_close
    expected = html_open + body_and_close

    cleaned_tree = preprocessor(html_to_dom(document))
    returned = html.tostring(cleaned_tree).decode("utf8")

    tools.assert_equal(expected, returned)
def test_preprocess_simple_unicode_xhtml_string_with_declaration(self):
    # XHTML input with an XML declaration and doctype. The expected output
    # keeps the <html> and <body> tags with their attributes and has no
    # declaration, doctype or <head>.
    input_fragments = [
        '<?xml version="1.0" encoding="windows-1250"?>',
        "<!DOCTYPE html>",
        '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="sk" lang="sk">',
        "<head>",
        "<title>Hello World</title>",
        '<meta http-equiv="imagetoolbar" content="no" />',
        '<meta http-equiv="Content-Type" content="text/html; charset=windows-1250" />',
        "</head>",
        '<body id="index">',
        "</body>",
        "</html>",
    ]
    expected_fragments = [
        '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="sk" lang="sk">',
        '<body id="index">',
        "</body>",
        "</html>",
    ]

    processed = preprocessor(html_to_dom("".join(input_fragments)))
    returned = html.tostring(processed).decode("utf8")
    expected = "".join(expected_fragments)

    tools.assert_equal(expected, returned)
def test_accepts_html_tree(self):
    # Passing an already-parsed tree to justext() must not raise
    # JustextMalformedInput; any such exception is reported as a failure.
    # NOTE(review): html_string is not defined inside this block; presumably
    # it comes from an enclosing scope not visible here. Confirm.
    try:
        # Both calls stay inside the try so that JustextMalformedInput from
        # either of them is turned into a test failure, as before.
        parsed_tree = html_to_dom(html_string)
        justext(parsed_tree, get_stoplist("English"))
    except JustextMalformedInput:
        self.fail("justext wasn't supposed to raise JustextMalformedInput exception on string")