def custom_justext(tree, stoplist):
    """Run the customized jusText pipeline on *tree* and return its paragraphs.

    The tree is passed through ``preprocessor``, split into paragraphs by
    ``ParagraphMaker.make_paragraphs``, classified against *stoplist* and
    then run through ``revise_paragraph_classification``. The same paragraph
    list that was handed to those two calls is returned.
    """
    # NOTE(review): the original carried a commented-out
    # ``tree_cleaning(tree, True)`` call at this point -- confirm whether it
    # should stay disabled.
    paragraphs = ParagraphMaker.make_paragraphs(preprocessor(tree))

    # NOTE(review): the positional values presumably correspond to jusText's
    # length_low, length_high, stopwords_low, stopwords_high,
    # max_link_density and no_headings parameters -- confirm against the
    # installed classify_paragraphs signature.
    classify_paragraphs(
        paragraphs,
        stoplist,
        50,
        200,
        0.1,
        0.2,
        0.2,
        True,
    )

    # NOTE(review): 200 is presumably the maximum heading distance -- confirm.
    revise_paragraph_classification(paragraphs, 200)
    return paragraphs
def test_remove_comments(self):
    # The parsed document is expected to serialize with both comments still
    # present; after preprocessor() the expected serialization has none.
    markup = (
        "<html><!-- comment --><body>"
        "<h1>Header</h1><!-- comment --> text<p>footer"
        "</body></html>"
    )
    tree = html.fromstring(markup)

    # Plain parse: comments are expected to survive, <p> comes back closed.
    tools.assert_equal(
        "<html><!-- comment --><body><h1>Header</h1><!-- comment --> text<p>footer</p></body></html>",
        html.tostring(tree).decode("utf8"),
    )

    # After preprocessing: no comments expected in the output.
    tree = preprocessor(tree)
    tools.assert_equal(
        "<html><body><h1>Header</h1> text<p>footer</p></body></html>",
        html.tostring(tree).decode("utf8"),
    )
def test_preprocess_simple_unicode_string(self):
    # Feeds a text (str) document through html_to_dom() + preprocessor() and
    # expects the serialized result to be the same document minus <head>.
    body = (
        '<body>'
        '<h1>Header</h1>'
        '<p>pre<span>text</span>post<em>emph</em>popost</p>'
        '<p>footer <em>like</em> a boss</p>'
        '</body>'
    )
    html_string = '<html><head><title>Title</title></head>' + body + '</html>'

    processed = preprocessor(html_to_dom(html_string))
    serialized = html.tostring(processed).decode("utf8")

    # Same body, no <head> element.
    expected = '<html>' + body + '</html>'
    tools.assert_equal(expected, serialized)
def test_remove_comments(self):
    # Compares the serialized tree before and after preprocessor(): the first
    # expectation still contains the comments, the second does not.
    def serialize(node):
        return html.tostring(node).decode("utf8")

    tree = html.fromstring(
        '<html><!-- comment --><body>'
        '<h1>Header</h1>'
        '<!-- comment --> text'
        '<p>footer'
        '</body></html>'
    )

    with_comments = '<html><!-- comment --><body><h1>Header</h1><!-- comment --> text<p>footer</p></body></html>'
    tools.assert_equal(with_comments, serialize(tree))

    without_comments = '<html><body><h1>Header</h1> text<p>footer</p></body></html>'
    tools.assert_equal(without_comments, serialize(preprocessor(tree)))
def test_preprocess_simple_bytes_string():
    """Check preprocessing of a document supplied as ``bytes``.

    The expected serialization has neither the ``<head>`` element nor the
    trailing comment; the whitespace around that comment is expected to
    remain.
    """
    raw_document = b''.join((
        b'<html><head><title>Title</title></head><body>',
        b'<h1>Header</h1>',
        b'<p>pre<span>text</span>post<em>emph</em>popost</p>',
        b'<p>footer <em>like</em> a boss</p>',
        b' <!-- abcdefgh -->\n',
        b'</body></html>',
    ))

    processed = preprocessor(html_to_dom(raw_document))
    returned = html.tostring(processed).decode("utf8")

    expected = ''.join((
        '<html><body>',
        '<h1>Header</h1>',
        '<p>pre<span>text</span>post<em>emph</em>popost</p>',
        '<p>footer <em>like</em> a boss</p>',
        ' \n',  # whitespace that surrounded the comment
        '</body></html>',
    ))
    assert expected == returned
def test_remove_comments():
    """Check that ``preprocessor`` output serializes without HTML comments.

    A first assertion pins the serialization of the freshly parsed tree
    (comments present); a second pins the serialization after preprocessing
    (comments absent).
    """
    source = ''.join([
        '<html><!-- comment --><body>',
        '<h1>Header</h1>',
        '<!-- comment --> text',
        '<p>footer',
        '</body></html>',
    ])
    tree = html.fromstring(source)

    before = html.tostring(tree).decode("utf8")
    assert '<html><!-- comment --><body><h1>Header</h1><!-- comment --> text<p>footer</p></body></html>' == before

    tree = preprocessor(tree)
    after = html.tostring(tree).decode("utf8")
    assert '<html><body><h1>Header</h1> text<p>footer</p></body></html>' == after
def test_remove_head_tag(self):
    # First verifies that parsing + serializing round-trips the input
    # unchanged, then that the preprocessed tree serializes without <head>.
    head = '<head><title>Title</title></head>'
    body = (
        '<body>'
        '<h1>Header</h1>'
        '<p><span>text</span></p>'
        '<p>footer <em>like</em> a boss</p>'
        '</body>'
    )
    html_string = '<html>' + head + body + '</html>'

    tree = html.fromstring(html_string)
    tools.assert_equal(html_string, html.tostring(tree).decode("utf8"))

    tree = preprocessor(tree)
    serialized = html.tostring(tree).decode("utf8")
    tools.assert_equal('<html>' + body + '</html>', serialized)
def test_preprocess_simple_unicode_string(self):
    # A str document goes through html_to_dom() and preprocessor(); the
    # expected serialization is the input without its <head> element.
    source = "".join(
        [
            "<html><head><title>Title</title></head><body>",
            "<h1>Header</h1>",
            "<p>pre<span>text</span>post<em>emph</em>popost</p>",
            "<p>footer <em>like</em> a boss</p>",
            "</body></html>",
        ]
    )
    tree = preprocessor(html_to_dom(source))
    actual = html.tostring(tree).decode("utf8")

    wanted = "".join(
        [
            "<html><body>",
            "<h1>Header</h1>",
            "<p>pre<span>text</span>post<em>emph</em>popost</p>",
            "<p>footer <em>like</em> a boss</p>",
            "</body></html>",
        ]
    )
    tools.assert_equal(wanted, actual)
def test_preprocess_simple_unicode_string():
    """Check preprocessing of a document supplied as text (``str``).

    The expected serialization keeps the body markup unchanged and has no
    ``<head>`` element.
    """
    body_parts = [
        '<h1>Header</h1>',
        '<p>pre<span>text</span>post<em>emph</em>popost</p>',
        '<p>footer <em>like</em> a boss</p>',
    ]
    inner = ''.join(body_parts)

    html_string = '<html><head><title>Title</title></head><body>' + inner + '</body></html>'
    processed = preprocessor(html_to_dom(html_string))
    returned = html.tostring(processed).decode("utf8")

    expected = '<html><body>' + inner + '</body></html>'
    assert expected == returned
def test_preprocess_simple_unicode_xhtml_string_with_declaration(self):
    # XHTML input carrying an XML declaration, a doctype and a <head> with
    # meta tags. The expected serialization is only the <html> element (with
    # its attributes) wrapping the empty <body>.
    html_open = '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="sk" lang="sk">'
    body = '<body id="index"></body>'
    head = (
        '<head>'
        '<title>Hello World</title>'
        '<meta http-equiv="imagetoolbar" content="no" />'
        '<meta http-equiv="Content-Type" content="text/html; charset=windows-1250" />'
        '</head>'
    )
    prolog = '<?xml version="1.0" encoding="windows-1250"?><!DOCTYPE html>'

    html_string = prolog + html_open + head + body + '</html>'
    tree = preprocessor(html_to_dom(html_string))
    serialized = html.tostring(tree).decode("utf8")

    tools.assert_equal(html_open + body + '</html>', serialized)
def test_remove_head_tag(self):
    # Step 1: parse/serialize is expected to reproduce the input verbatim.
    # Step 2: after preprocessor() the expected output has no <head>.
    def as_text(node):
        return html.tostring(node).decode("utf8")

    html_string = (
        "<html><head><title>Title</title></head><body>"
        "<h1>Header</h1><p><span>text</span></p>"
        "<p>footer <em>like</em> a boss</p>"
        "</body></html>"
    )
    tree = html.fromstring(html_string)
    tools.assert_equal(html_string, as_text(tree))

    headless = (
        "<html><body>"
        "<h1>Header</h1><p><span>text</span></p>"
        "<p>footer <em>like</em> a boss</p>"
        "</body></html>"
    )
    tools.assert_equal(headless, as_text(preprocessor(tree)))
def test_remove_head_tag():
    """Check that the preprocessed tree serializes without ``<head>``.

    The input is first asserted to round-trip unchanged through
    ``html.fromstring`` / ``html.tostring``; the preprocessed tree is then
    asserted to serialize to the same markup minus the ``<head>`` element.
    """
    body = ''.join((
        '<body>',
        '<h1>Header</h1>',
        '<p><span>text</span></p>',
        '<p>footer <em>like</em> a boss</p>',
        '</body>',
    ))
    html_string = '<html><head><title>Title</title></head>' + body + '</html>'

    tree = html.fromstring(html_string)
    round_tripped = html.tostring(tree).decode("utf8")
    assert html_string == round_tripped

    tree = preprocessor(tree)
    returned = html.tostring(tree).decode("utf8")
    expected = '<html>' + body + '</html>'
    assert expected == returned
def test_preprocess_simple_unicode_xhtml_string_with_declaration(self):
    # The input is an XHTML document with an XML declaration, doctype and a
    # <head> holding title/meta tags; the expected serialization contains
    # just the attributed <html> element and its empty <body>.
    source = "".join(
        [
            '<?xml version="1.0" encoding="windows-1250"?>',
            "<!DOCTYPE html>",
            '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="sk" lang="sk">',
            "<head>",
            "<title>Hello World</title>",
            '<meta http-equiv="imagetoolbar" content="no" />',
            '<meta http-equiv="Content-Type" content="text/html; charset=windows-1250" />',
            "</head>",
            '<body id="index">',
            "</body>",
            "</html>",
        ]
    )
    tree = preprocessor(html_to_dom(source))
    actual = html.tostring(tree).decode("utf8")

    wanted = "".join(
        [
            '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="sk" lang="sk">',
            '<body id="index">',
            "</body>",
            "</html>",
        ]
    )
    tools.assert_equal(wanted, actual)