def test_HtmlTool_clean_html():
    """Run ``HtmlTool.clean_html`` on a tiny document and on ``html_nasty``.

    The test always ends as an expected failure: ``pytest.xfail`` is called
    unconditionally, so the final assertion is never reached.
    """
    from clair.textprocessing import HtmlTool

    # Smoke run on a minimal document; the result is only displayed.
    minimal_doc = """<head></head> <body>Foo</body>"""
    cleaned = HtmlTool.clean_html(minimal_doc)
    print(cleaned)

    # Same call on the ``html_nasty`` sample defined outside this function.
    cleaned = HtmlTool.clean_html(html_nasty)
    print(cleaned)

    # The imperative xfail ends the test at this point.
    pytest.xfail("LXML does strange things.")  #IGNORE:E1101
    # NOTE(review): unreachable while the xfail above is in place. The
    # literal has no ``u`` prefix, unlike the checks in the to_nice_text
    # test -- confirm which form is intended before re-enabling.
    assert cleaned.find("Blitzgerät") >= 0
def test_HtmlTool_to_nice_text():
    """Check the text ``HtmlTool.to_nice_text`` produces for ``html_nasty``."""
    from clair.textprocessing import HtmlTool

    readable = HtmlTool.to_nice_text(html_nasty)
    # Show input and output to ease debugging of a failing run.
    print(html_nasty)
    print(readable)

    # Both words containing umlauts must be present in the converted text.
    for expected_word in (u"Blitzgerät", u"Rückseite"):
        assert readable.find(expected_word) >= 0

    # The result must split into exactly 8 newline-separated pieces.
    piece_count = len(readable.split("\n"))
    assert piece_count == 8
def test_HtmlTool_remove_html():
    """Check ``HtmlTool.remove_html`` against several inputs.

    Covers inline tags, an embedded style sheet, the non-string inputs
    ``None`` and ``nan``, and text that contains bare ``<`` / ``>`` signs.
    """
    from clair.textprocessing import HtmlTool

    # Tags are expected to vanish while the bare ">" stays in the text.
    tagged = "This is <b>bold</b> text. <p>Paragraph.</p> 3 > 2"
    plain = HtmlTool.remove_html(tagged)
    print(plain)
    assert plain == "This is bold text. Paragraph. 3 > 2"

    # The style element and its CSS content are expected to be dropped.
    styled = (
        'Integrated style sheet. <style type="text/css"> p {color:blue;} '
        '</style> Text after style sheet.'
    )
    plain = HtmlTool.remove_html(styled)
    print(plain)
    assert plain == "Integrated style sheet. Text after style sheet."

    # Non-string inputs are expected to yield an empty string.
    assert HtmlTool.remove_html(None) == ""
    assert HtmlTool.remove_html(nan) == ""

    # Comparison signs and surrounding blanks are expected to pass through.
    inequality = " 1 < 2 > 0.5 "
    plain = HtmlTool.remove_html(inequality)
    print(plain)
    assert plain == " 1 < 2 > 0.5 "